Update src/streamlit_app.py

src/streamlit_app.py CHANGED (+1539 -38)

@@ -1,40 +1,1541 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-
-"""
-
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))
| 1 |
+
# import os
|
| 2 |
+
# import requests
|
| 3 |
+
# import arxiv
|
| 4 |
+
# from pathlib import Path
|
| 5 |
+
# from typing import List, Dict, Optional
|
| 6 |
+
# import streamlit as st
|
| 7 |
+
# from llama_index.core import (
|
| 8 |
+
# VectorStoreIndex,
|
| 9 |
+
# SimpleDirectoryReader,
|
| 10 |
+
# Settings,
|
| 11 |
+
# Document,
|
| 12 |
+
# StorageContext,
|
| 13 |
+
# load_index_from_storage
|
| 14 |
+
# )
|
| 15 |
+
# from llama_index.core.node_parser import SentenceSplitter
|
| 16 |
+
# from llama_index.llms.groq import Groq
|
| 17 |
+
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 18 |
+
# from llama_index.core.query_engine import RetrieverQueryEngine
|
| 19 |
+
# from llama_index.core.retrievers import VectorIndexRetriever
|
| 20 |
+
# from llama_index.core.response_synthesizers import get_response_synthesizer
|
| 21 |
+
# from llama_index.core.memory import ChatMemoryBuffer
|
| 22 |
+
# from llama_index.core.chat_engine import CondensePlusContextChatEngine
|
| 23 |
+
# import logging
|
| 24 |
+
# import hashlib
|
| 25 |
+
# import json
|
| 26 |
+
# import time
|
| 27 |
+
# from datetime import datetime
|
| 28 |
+
|
| 29 |
+
# # Configure logging
|
| 30 |
+
# logging.basicConfig(level=logging.INFO)
|
| 31 |
+
# logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
# class AcademicPaperQA:
|
| 34 |
+
# def __init__(self, model_name="llama3-70b-8192", groq_api_key=None):
|
| 35 |
+
# """Initialize the Academic Paper Q&A system with Groq API"""
|
| 36 |
+
# self.data_dir = Path("./papers")
|
| 37 |
+
# self.storage_dir = Path("./storage")
|
| 38 |
+
# self.model_name = model_name
|
| 39 |
+
# self.groq_api_key = groq_api_key
|
| 40 |
+
|
| 41 |
+
# # Create directories
|
| 42 |
+
# self.data_dir.mkdir(exist_ok=True)
|
| 43 |
+
# self.storage_dir.mkdir(exist_ok=True)
|
| 44 |
+
|
| 45 |
+
# # Initialize models
|
| 46 |
+
# self._setup_models()
|
| 47 |
+
|
| 48 |
+
# # Initialize index and chat engine
|
| 49 |
+
# self.index = None
|
| 50 |
+
# self.query_engine = None
|
| 51 |
+
# self.chat_engine = None
|
| 52 |
+
# self.current_papers_hash = None
|
| 53 |
+
# self.is_ready = False
|
| 54 |
+
|
| 55 |
+
# # Chat history
|
| 56 |
+
# self.chat_history = []
|
| 57 |
+
|
| 58 |
+
# def _setup_models(self):
|
| 59 |
+
# """Setup LLM and embedding models with Groq API"""
|
| 60 |
+
# try:
|
| 61 |
+
# if not self.groq_api_key:
|
| 62 |
+
# raise ValueError("Groq API key is required. Please set GROQ_API_KEY environment variable or pass it directly.")
|
| 63 |
+
|
| 64 |
+
# # Initialize LLM via Groq API with settings optimized for detailed responses
|
| 65 |
+
# self.llm = Groq(
|
| 66 |
+
# model=self.model_name,
|
| 67 |
+
# api_key=self.groq_api_key,
|
| 68 |
+
# temperature=0.3, # Slightly higher for more creative/detailed responses
|
| 69 |
+
# max_tokens=4096, # Maximum tokens for response
|
| 70 |
+
# top_p=0.9, # Nucleus sampling for better quality
|
| 71 |
+
# system_prompt="""You are an expert academic research assistant. When answering questions about research papers, provide comprehensive, detailed responses that include:
|
| 72 |
+
|
| 73 |
+
# 1. Direct answers to the question asked
|
| 74 |
+
# 2. Relevant background context and explanations
|
| 75 |
+
# 3. Specific details from the papers including methodologies, findings, and implications
|
| 76 |
+
# 4. Analysis and interpretation of the information
|
| 77 |
+
# 5. Connections between different concepts when relevant
|
| 78 |
+
# 6. Limitations or caveats when appropriate
|
| 79 |
+
|
| 80 |
+
# Always aim for thorough, well-structured responses that demonstrate deep understanding of the academic content. Use clear paragraphs and explain technical concepts when necessary."""
|
| 81 |
+
# )
|
| 82 |
+
|
| 83 |
+
# # Initialize lightweight embedding model for CPU usage
|
| 84 |
+
# self.embed_model = HuggingFaceEmbedding(
|
| 85 |
+
# model_name="sentence-transformers/all-MiniLM-L6-v2",
|
| 86 |
+
# device="cpu" # Explicitly use CPU
|
| 87 |
+
# )
|
| 88 |
+
|
| 89 |
+
# # Configure global settings
|
| 90 |
+
# Settings.llm = self.llm
|
| 91 |
+
# Settings.embed_model = self.embed_model
|
| 92 |
+
# Settings.chunk_size = 512 # Smaller chunks for better CPU performance
|
| 93 |
+
# Settings.chunk_overlap = 50 # Reduced overlap for CPU efficiency
|
| 94 |
+
|
| 95 |
+
# logger.info(f"Models initialized successfully with {self.model_name} via Groq API")
|
| 96 |
+
|
| 97 |
+
# except Exception as e:
|
| 98 |
+
# logger.error(f"Error setting up models: {e}")
|
| 99 |
+
# raise
|
| 100 |
+
|
| 101 |
+
# def _get_papers_hash(self) -> str:
|
| 102 |
+
# """Generate hash of current papers in directory"""
|
| 103 |
+
# pdf_files = list(self.data_dir.glob("*.pdf"))
|
| 104 |
+
# if not pdf_files:
|
| 105 |
+
# return ""
|
| 106 |
+
|
| 107 |
+
# # Create hash based on filenames and file sizes
|
| 108 |
+
# file_info = []
|
| 109 |
+
# for pdf_file in sorted(pdf_files):
|
| 110 |
+
# file_info.append(f"{pdf_file.name}:{pdf_file.stat().st_size}")
|
| 111 |
+
|
| 112 |
+
# papers_string = "|".join(file_info)
|
| 113 |
+
# return hashlib.md5(papers_string.encode()).hexdigest()
|
| 114 |
+
|
| 115 |
+
# def _save_papers_metadata(self, papers_hash: str):
|
| 116 |
+
# """Save metadata about current papers"""
|
| 117 |
+
# metadata_file = self.storage_dir / "papers_metadata.json"
|
| 118 |
+
# metadata = {
|
| 119 |
+
# "papers_hash": papers_hash,
|
| 120 |
+
# "model_name": self.model_name
|
| 121 |
+
# }
|
| 122 |
+
# with open(metadata_file, "w") as f:
|
| 123 |
+
# json.dump(metadata, f)
|
| 124 |
+
|
| 125 |
+
# def _load_papers_metadata(self) -> Dict:
|
| 126 |
+
# """Load metadata about papers"""
|
| 127 |
+
# metadata_file = self.storage_dir / "papers_metadata.json"
|
| 128 |
+
# if metadata_file.exists():
|
| 129 |
+
# with open(metadata_file, "r") as f:
|
| 130 |
+
# return json.load(f)
|
| 131 |
+
# return {}
|
| 132 |
+
|
| 133 |
+
# def download_arxiv_paper(self, arxiv_id: str) -> Optional[str]:
|
| 134 |
+
# """Download paper from arXiv"""
|
| 135 |
+
# try:
|
| 136 |
+
# search = arxiv.Search(id_list=[arxiv_id])
|
| 137 |
+
# paper = next(search.results())
|
| 138 |
+
|
| 139 |
+
# filename = f"{arxiv_id.replace('/', '_')}.pdf"
|
| 140 |
+
# filepath = self.data_dir / filename
|
| 141 |
+
|
| 142 |
+
# paper.download_pdf(dirpath=str(self.data_dir), filename=filename)
|
| 143 |
+
|
| 144 |
+
# logger.info(f"Downloaded paper: {paper.title}")
|
| 145 |
+
# return str(filepath)
|
| 146 |
+
|
| 147 |
+
# except Exception as e:
|
| 148 |
+
# logger.error(f"Error downloading paper {arxiv_id}: {e}")
|
| 149 |
+
# return None
|
| 150 |
+
|
| 151 |
+
# def load_documents(self, file_paths: List[str] = None) -> List[Document]:
|
| 152 |
+
# """Load documents from PDF files"""
|
| 153 |
+
# try:
|
| 154 |
+
# if file_paths is None:
|
| 155 |
+
# reader = SimpleDirectoryReader(
|
| 156 |
+
# input_dir=str(self.data_dir),
|
| 157 |
+
# required_exts=[".pdf"]
|
| 158 |
+
# )
|
| 159 |
+
# else:
|
| 160 |
+
# reader = SimpleDirectoryReader(input_files=file_paths)
|
| 161 |
+
|
| 162 |
+
# documents = reader.load_data()
|
| 163 |
+
# logger.info(f"Loaded {len(documents)} documents")
|
| 164 |
+
|
| 165 |
+
# return documents
|
| 166 |
+
|
| 167 |
+
# except Exception as e:
|
| 168 |
+
# logger.error(f"Error loading documents: {e}")
|
| 169 |
+
# return []
|
| 170 |
+
|
| 171 |
+
# def create_index(self, documents: List[Document], save_index: bool = True):
|
| 172 |
+
# """Create vector index from documents with CPU-optimized settings"""
|
| 173 |
+
# try:
|
| 174 |
+
# if not documents:
|
| 175 |
+
# raise ValueError("No documents provided for indexing")
|
| 176 |
+
|
| 177 |
+
# logger.info(f"Creating index from {len(documents)} documents")
|
| 178 |
+
|
| 179 |
+
# # CPU-optimized sentence splitter
|
| 180 |
+
# sentence_splitter = SentenceSplitter(
|
| 181 |
+
# chunk_size=512, # Smaller chunks for CPU
|
| 182 |
+
# chunk_overlap=50
|
| 183 |
+
# )
|
| 184 |
+
|
| 185 |
+
# self.index = VectorStoreIndex.from_documents(
|
| 186 |
+
# documents,
|
| 187 |
+
# transformations=[sentence_splitter],
|
| 188 |
+
# show_progress=True
|
| 189 |
+
# )
|
| 190 |
+
|
| 191 |
+
# if save_index:
|
| 192 |
+
# self.index.storage_context.persist(persist_dir=str(self.storage_dir))
|
| 193 |
+
# current_hash = self._get_papers_hash()
|
| 194 |
+
# self._save_papers_metadata(current_hash)
|
| 195 |
+
# self.current_papers_hash = current_hash
|
| 196 |
+
# logger.info("Index saved to storage")
|
| 197 |
+
|
| 198 |
+
# self._create_query_engine()
|
| 199 |
+
# self._create_chat_engine()
|
| 200 |
+
# self.is_ready = True
|
| 201 |
+
# logger.info("Vector index created successfully")
|
| 202 |
+
|
| 203 |
+
# except Exception as e:
|
| 204 |
+
# logger.error(f"Error creating index: {e}")
|
| 205 |
+
# self.is_ready = False
|
| 206 |
+
# raise
|
| 207 |
+
|
| 208 |
+
# def should_rebuild_index(self) -> bool:
|
| 209 |
+
# """Check if index should be rebuilt based on papers"""
|
| 210 |
+
# current_hash = self._get_papers_hash()
|
| 211 |
+
|
| 212 |
+
# if not current_hash:
|
| 213 |
+
# return False
|
| 214 |
+
|
| 215 |
+
# metadata = self._load_papers_metadata()
|
| 216 |
+
|
| 217 |
+
# if not metadata:
|
| 218 |
+
# logger.info("No metadata found, rebuilding index")
|
| 219 |
+
# return True
|
| 220 |
+
|
| 221 |
+
# if metadata.get("papers_hash") != current_hash:
|
| 222 |
+
# logger.info("Papers hash changed, rebuilding index")
|
| 223 |
+
# return True
|
| 224 |
+
|
| 225 |
+
# if metadata.get("model_name") != self.model_name:
|
| 226 |
+
# logger.info("Model changed, rebuilding index")
|
| 227 |
+
# return True
|
| 228 |
+
|
| 229 |
+
# return False
|
| 230 |
+
|
| 231 |
+
# def load_index(self) -> bool:
|
| 232 |
+
# """Load existing index from storage if it matches current papers"""
|
| 233 |
+
# try:
|
| 234 |
+
# if self.should_rebuild_index():
|
| 235 |
+
# logger.info("Index needs to be rebuilt due to changes")
|
| 236 |
+
# return False
|
| 237 |
+
|
| 238 |
+
# index_files = list(self.storage_dir.glob("*"))
|
| 239 |
+
# if not index_files:
|
| 240 |
+
# logger.info("No index files found")
|
| 241 |
+
# return False
|
| 242 |
+
|
| 243 |
+
# storage_context = StorageContext.from_defaults(
|
| 244 |
+
# persist_dir=str(self.storage_dir)
|
| 245 |
+
# )
|
| 246 |
+
# self.index = load_index_from_storage(storage_context)
|
| 247 |
+
# self._create_query_engine()
|
| 248 |
+
# self._create_chat_engine()
|
| 249 |
+
# self.current_papers_hash = self._get_papers_hash()
|
| 250 |
+
# self.is_ready = True
|
| 251 |
+
|
| 252 |
+
# logger.info("Index loaded from storage successfully")
|
| 253 |
+
# return True
|
| 254 |
+
|
| 255 |
+
# except Exception as e:
|
| 256 |
+
# logger.error(f"Error loading index: {e}")
|
| 257 |
+
# self.is_ready = False
|
| 258 |
+
# return False
|
| 259 |
+
|
| 260 |
+
# def _create_query_engine(self):
|
| 261 |
+
# """Create query engine with settings for detailed responses"""
|
| 262 |
+
# try:
|
| 263 |
+
# if not self.index:
|
| 264 |
+
# raise ValueError("No index available for query engine")
|
| 265 |
+
|
| 266 |
+
# retriever = VectorIndexRetriever(
|
| 267 |
+
# index=self.index,
|
| 268 |
+
# similarity_top_k=3 # Reduced for CPU efficiency
|
| 269 |
+
# )
|
| 270 |
+
|
| 271 |
+
# response_synthesizer = get_response_synthesizer(
|
| 272 |
+
# response_mode="compact", # Better for detailed responses
|
| 273 |
+
# streaming=False,
|
| 274 |
+
# text_qa_template="""Context information is below.
|
| 275 |
+
# ---------------------
|
| 276 |
+
# {context_str}
|
| 277 |
+
# ---------------------
|
| 278 |
+
# Given the context information and not prior knowledge, please provide a comprehensive and detailed answer to the question. Include specific details from the research papers, explain methodologies when relevant, discuss findings thoroughly, and provide analysis and implications. Structure your response clearly with proper explanations of technical concepts.
|
| 279 |
+
|
| 280 |
+
# Question: {query_str}
|
| 281 |
+
# Answer: """
|
| 282 |
+
# )
|
| 283 |
+
|
| 284 |
+
# self.query_engine = RetrieverQueryEngine(
|
| 285 |
+
# retriever=retriever,
|
| 286 |
+
# response_synthesizer=response_synthesizer
|
| 287 |
+
# )
|
| 288 |
+
|
| 289 |
+
# logger.info("Query engine created successfully")
|
| 290 |
+
|
| 291 |
+
# except Exception as e:
|
| 292 |
+
# logger.error(f"Error creating query engine: {e}")
|
| 293 |
+
# raise
|
| 294 |
+
|
| 295 |
+
# def _create_chat_engine(self):
|
| 296 |
+
# """Create chat engine for conversational interactions"""
|
| 297 |
+
# try:
|
| 298 |
+
# if not self.index:
|
| 299 |
+
# raise ValueError("No index available for chat engine")
|
| 300 |
+
|
| 301 |
+
# # Create memory buffer for chat history
|
| 302 |
+
# memory = ChatMemoryBuffer.from_defaults(token_limit=2000) # Reduced for efficiency
|
| 303 |
+
|
| 304 |
+
# # Create chat engine
|
| 305 |
+
# self.chat_engine = CondensePlusContextChatEngine.from_defaults(
|
| 306 |
+
# retriever=VectorIndexRetriever(
|
| 307 |
+
# index=self.index,
|
| 308 |
+
# similarity_top_k=3
|
| 309 |
+
# ),
|
| 310 |
+
# memory=memory,
|
| 311 |
+
# llm=self.llm,
|
| 312 |
+
# context_prompt=(
|
| 313 |
+
# "You are an expert academic research assistant having a conversation about research papers. "
|
| 314 |
+
# "Use the following context from the papers to answer questions thoroughly and in detail. "
|
| 315 |
+
# "Provide comprehensive explanations, include specific findings, methodologies, and implications. "
|
| 316 |
+
# "Build upon previous parts of the conversation when relevant.\n"
|
| 317 |
+
# "Context:\n"
|
| 318 |
+
# "{context_str}\n"
|
| 319 |
+
# "Instructions: Answer the user's question in detail using the provided context."
|
| 320 |
+
# ),
|
| 321 |
+
# verbose=True
|
| 322 |
+
# )
|
| 323 |
+
|
| 324 |
+
# logger.info("Chat engine created successfully")
|
| 325 |
+
|
| 326 |
+
# except Exception as e:
|
| 327 |
+
# logger.error(f"Error creating chat engine: {e}")
|
| 328 |
+
# raise
|
| 329 |
+
|
| 330 |
+
# def get_loaded_papers_info(self) -> List[str]:
|
| 331 |
+
# """Get list of currently loaded papers"""
|
| 332 |
+
# pdf_files = list(self.data_dir.glob("*.pdf"))
|
| 333 |
+
# return [pdf_file.name for pdf_file in pdf_files]
|
| 334 |
+
|
| 335 |
+
# def clear_papers(self):
|
| 336 |
+
# """Clear all papers and reset index"""
|
| 337 |
+
# try:
|
| 338 |
+
# # Remove all PDF files
|
| 339 |
+
# for pdf_file in self.data_dir.glob("*.pdf"):
|
| 340 |
+
# pdf_file.unlink()
|
| 341 |
+
|
| 342 |
+
# # Clear storage
|
| 343 |
+
# if self.storage_dir.exists():
|
| 344 |
+
# import shutil
|
| 345 |
+
# shutil.rmtree(self.storage_dir)
|
| 346 |
+
# self.storage_dir.mkdir(exist_ok=True)
|
| 347 |
+
|
| 348 |
+
# # Reset everything
|
| 349 |
+
# self.index = None
|
| 350 |
+
# self.query_engine = None
|
| 351 |
+
# self.chat_engine = None
|
| 352 |
+
# self.current_papers_hash = None
|
| 353 |
+
# self.is_ready = False
|
| 354 |
+
# self.chat_history = []
|
| 355 |
+
|
| 356 |
+
# logger.info("Papers and index cleared")
|
| 357 |
+
# return True
|
| 358 |
+
|
| 359 |
+
# except Exception as e:
|
| 360 |
+
# logger.error(f"Error clearing papers: {e}")
|
| 361 |
+
# return False
|
| 362 |
+
|
| 363 |
+
# def clear_chat_history(self):
|
| 364 |
+
# """Clear chat history and reset memory"""
|
| 365 |
+
# try:
|
| 366 |
+
# self.chat_history = []
|
| 367 |
+
# if self.chat_engine and hasattr(self.chat_engine, 'memory'):
|
| 368 |
+
# self.chat_engine.memory.reset()
|
| 369 |
+
# logger.info("Chat history cleared")
|
| 370 |
+
# except Exception as e:
|
| 371 |
+
# logger.error(f"Error clearing chat history: {e}")
|
| 372 |
+
|
| 373 |
+
# def process_all_papers(self) -> Dict[str, str]:
|
| 374 |
+
# """Process all papers in the directory and create/load index"""
|
| 375 |
+
# try:
|
| 376 |
+
# current_papers = self.get_loaded_papers_info()
|
| 377 |
+
# if not current_papers:
|
| 378 |
+
# return {"error": "No papers found in directory"}
|
| 379 |
+
|
| 380 |
+
# logger.info(f"Processing {len(current_papers)} papers: {current_papers}")
|
| 381 |
+
|
| 382 |
+
# if self.load_index():
|
| 383 |
+
# return {"success": f"Loaded existing index for {len(current_papers)} papers"}
|
| 384 |
+
|
| 385 |
+
# logger.info("Creating new index from documents...")
|
| 386 |
+
# documents = self.load_documents()
|
| 387 |
+
|
| 388 |
+
# if not documents:
|
| 389 |
+
# return {"error": "Failed to load documents from PDF files"}
|
| 390 |
+
|
| 391 |
+
# self.create_index(documents)
|
| 392 |
+
|
| 393 |
+
# if self.is_ready:
|
| 394 |
+
# return {"success": f"Successfully created index for {len(current_papers)} papers"}
|
| 395 |
+
# else:
|
| 396 |
+
# return {"error": "Failed to create index"}
|
| 397 |
+
|
| 398 |
+
# except Exception as e:
|
| 399 |
+
# logger.error(f"Error processing papers: {e}")
|
| 400 |
+
# return {"error": f"Error processing papers: {str(e)}"}
|
| 401 |
+
|
| 402 |
+
# def ask_question(self, question: str, use_chat_engine: bool = True) -> Dict[str, any]:
|
| 403 |
+
# """Ask a question using either chat engine (conversational) or query engine (standalone)"""
|
| 404 |
+
# if not self.is_ready:
|
| 405 |
+
# return {"error": "System not ready. Please process papers first."}
|
| 406 |
+
|
| 407 |
+
# try:
|
| 408 |
+
# logger.info(f"Asking question: {question}")
|
| 409 |
+
|
| 410 |
+
# if use_chat_engine and self.chat_engine:
|
| 411 |
+
# # Use chat engine for conversational context
|
| 412 |
+
# response = self.chat_engine.chat(question)
|
| 413 |
+
# answer = str(response)
|
| 414 |
+
|
| 415 |
+
# # Add to chat history
|
| 416 |
+
# self.chat_history.append({
|
| 417 |
+
# "timestamp": datetime.now().strftime("%H:%M:%S"),
|
| 418 |
+
# "question": question,
|
| 419 |
+
# "answer": answer,
|
| 420 |
+
# "type": "chat"
|
| 421 |
+
# })
|
| 422 |
+
|
| 423 |
+
# # Get sources if available
|
| 424 |
+
# sources = []
|
| 425 |
+
# if hasattr(response, 'source_nodes') and response.source_nodes:
|
| 426 |
+
# for i, node in enumerate(response.source_nodes):
|
| 427 |
+
# sources.append({
|
| 428 |
+
# 'text': node.text[:400] + "..." if len(node.text) > 400 else node.text,
|
| 429 |
+
# 'score': node.score if hasattr(node, 'score') else 'N/A'
|
| 430 |
+
# })
|
| 431 |
+
|
| 432 |
+
# else:
|
| 433 |
+
# # Use query engine for standalone questions
|
| 434 |
+
# response = self.query_engine.query(question)
|
| 435 |
+
# answer = str(response)
|
| 436 |
+
|
| 437 |
+
# # Add to chat history
|
| 438 |
+
# self.chat_history.append({
|
| 439 |
+
# "timestamp": datetime.now().strftime("%H:%M:%S"),
|
| 440 |
+
# "question": question,
|
| 441 |
+
# "answer": answer,
|
| 442 |
+
# "type": "query"
|
| 443 |
+
# })
|
| 444 |
+
|
| 445 |
+
# sources = []
|
| 446 |
+
# if hasattr(response, 'source_nodes') and response.source_nodes:
|
| 447 |
+
# for i, node in enumerate(response.source_nodes):
|
| 448 |
+
# sources.append({
|
| 449 |
+
# 'text': node.text[:400] + "..." if len(node.text) > 400 else node.text,
|
| 450 |
+
# 'score': node.score if hasattr(node, 'score') else 'N/A'
|
| 451 |
+
# })
|
| 452 |
+
|
| 453 |
+
# logger.info(f"Generated answer length: {len(answer)} characters")
|
| 454 |
+
|
| 455 |
+
# return {
|
| 456 |
+
# "answer": answer,
|
| 457 |
+
# "sources": sources,
|
| 458 |
+
# "timestamp": datetime.now().strftime("%H:%M:%S")
|
| 459 |
+
# }
|
| 460 |
+
|
| 461 |
+
# except Exception as e:
|
| 462 |
+
# logger.error(f"Error answering question: {e}")
|
| 463 |
+
# return {"error": f"Error processing question: {str(e)}"}
|
| 464 |
+
|
| 465 |
+
# def create_streamlit_app():
|
| 466 |
+
# """Create Streamlit web interface with chat functionality"""
|
| 467 |
+
# st.set_page_config(
|
| 468 |
+
# page_title="Academic Paper Q&A Bot (Groq Powered)",
|
| 469 |
+
# page_icon="π¬",
|
| 470 |
+
# layout="wide"
|
| 471 |
+
# )
|
| 472 |
+
|
| 473 |
+
# st.title("π¬ Academic Paper Q&A Bot (Groq Powered)")
|
| 474 |
+
|
| 475 |
+
# # Custom CSS for chat interface
|
| 476 |
+
# st.markdown("""
|
| 477 |
+
# <style>
|
| 478 |
+
# .chat-message {
|
| 479 |
+
# padding: 1rem;
|
| 480 |
+
# border-radius: 0.5rem;
|
| 481 |
+
# margin-bottom: 1rem;
|
| 482 |
+
# display: flex;
|
| 483 |
+
# flex-direction: column;
|
| 484 |
+
# }
|
| 485 |
+
# .user-message {
|
| 486 |
+
# background-color: #e3f2fd;
|
| 487 |
+
# margin-left: 20%;
|
| 488 |
+
# }
|
| 489 |
+
# .bot-message {
|
| 490 |
+
# background-color: #f5f5f5;
|
| 491 |
+
# margin-right: 20%;
|
| 492 |
+
# }
|
| 493 |
+
# .message-content {
|
| 494 |
+
# margin: 0.5rem 0;
|
| 495 |
+
# }
|
| 496 |
+
# .message-timestamp {
|
| 497 |
+
# font-size: 0.8rem;
|
| 498 |
+
# color: #666;
|
| 499 |
+
# align-self: flex-end;
|
| 500 |
+
# }
|
| 501 |
+
# .stChatInputContainer {
|
| 502 |
+
# position: fixed;
|
| 503 |
+
# bottom: 0;
|
| 504 |
+
# background: white;
|
| 505 |
+
# padding: 1rem;
|
| 506 |
+
# border-top: 1px solid #e0e0e0;
|
| 507 |
+
# }
|
| 508 |
+
# </style>
|
| 509 |
+
# """, unsafe_allow_html=True)
|
| 510 |
+
|
| 511 |
+
# # API Key configuration in sidebar
|
| 512 |
+
# st.sidebar.header("π API Configuration")
|
| 513 |
+
# groq_api_key = st.sidebar.text_input(
|
| 514 |
+
# "Groq API Key:",
|
| 515 |
+
# type="password",
|
| 516 |
+
# help="Get your free API key from https://console.groq.com/keys"
|
| 517 |
+
# )
|
| 518 |
+
|
| 519 |
+
# if not groq_api_key:
|
| 520 |
+
# groq_api_key = os.getenv("GROQ_API_KEY")
|
| 521 |
+
|
| 522 |
+
# if not groq_api_key:
|
| 523 |
+
# st.sidebar.error("Please enter your Groq API key or set GROQ_API_KEY environment variable")
|
| 524 |
+
# st.info("π **To get started:**\n1. Go to https://console.groq.com/keys\n2. Create a free account\n3. Generate an API key\n4. Enter it in the sidebar")
|
| 525 |
+
# st.stop()
|
| 526 |
+
|
| 527 |
+
# # Model selection in sidebar
|
| 528 |
+
# st.sidebar.header("βοΈ Configuration")
|
| 529 |
+
# model_options = {
|
| 530 |
+
# "Llama3 70B (Most Capable)": "llama3-70b-8192",
|
| 531 |
+
# "Llama3 8B (Fast)": "llama3-8b-8192",
|
| 532 |
+
# "Mixtral 8x7B (Balanced)": "mixtral-8x7b-32768",
|
| 533 |
+
# "Gemma 7B (Efficient)": "gemma-7b-it"
|
| 534 |
+
# }
|
| 535 |
+
|
| 536 |
+
# selected_model = st.sidebar.selectbox(
|
| 537 |
+
# "Choose Groq Model:",
|
| 538 |
+
# list(model_options.keys()),
|
| 539 |
+
# index=0
|
| 540 |
+
# )
|
| 541 |
+
|
| 542 |
+
# model_name = model_options[selected_model]
|
| 543 |
+
|
| 544 |
+
# # Initialize session state
|
| 545 |
+
# if ('qa_system' not in st.session_state or
|
| 546 |
+
# st.session_state.get('current_model') != model_name or
|
| 547 |
+
# st.session_state.get('current_api_key') != groq_api_key):
|
| 548 |
+
|
| 549 |
+
# with st.spinner(f"Initializing system with {selected_model}..."):
|
| 550 |
+
# try:
|
| 551 |
+
# st.session_state.qa_system = AcademicPaperQA(
|
| 552 |
+
# model_name=model_name,
|
| 553 |
+
# groq_api_key=groq_api_key
|
| 554 |
+
# )
|
| 555 |
+
# st.session_state.current_model = model_name
|
| 556 |
+
# st.session_state.current_api_key = groq_api_key
|
| 557 |
+
# st.session_state.papers_loaded = False
|
| 558 |
+
# st.success(f"System initialized with {selected_model} via Groq API!")
|
| 559 |
+
# except Exception as e:
|
| 560 |
+
# st.error(f"Error initializing system: {e}")
|
| 561 |
+
# st.info("Please check your Groq API key and try again.")
|
| 562 |
+
# st.stop()
|
| 563 |
+
|
| 564 |
+
# if 'papers_loaded' not in st.session_state:
|
| 565 |
+
# st.session_state.papers_loaded = False
|
| 566 |
+
|
| 567 |
+
# # Display current model info
|
| 568 |
+
# st.sidebar.info(f"**Current model:** {selected_model}")
|
| 569 |
+
# st.sidebar.success("β
Using Groq API (Cloud)")
|
| 570 |
+
|
| 571 |
+
# # Show system status
|
| 572 |
+
# if hasattr(st.session_state.qa_system, 'is_ready'):
|
| 573 |
+
# if st.session_state.qa_system.is_ready:
|
| 574 |
+
# st.sidebar.success("β
System Ready")
|
| 575 |
+
# else:
|
| 576 |
+
# st.sidebar.warning("β οΈ Process papers first")
|
| 577 |
+
|
| 578 |
+
# # Show currently loaded papers
|
| 579 |
+
# current_papers = st.session_state.qa_system.get_loaded_papers_info()
|
| 580 |
+
# if current_papers:
|
| 581 |
+
# st.sidebar.subheader("π Loaded Papers:")
|
| 582 |
+
# for paper in current_papers:
|
| 583 |
+
# st.sidebar.text(f"π {paper}")
|
| 584 |
+
|
| 585 |
+
# if st.sidebar.button("ποΈ Clear All Papers"):
|
| 586 |
+
# with st.spinner("Clearing papers..."):
|
| 587 |
+
# if st.session_state.qa_system.clear_papers():
|
| 588 |
+
# st.session_state.papers_loaded = False
|
| 589 |
+
# st.sidebar.success("Papers cleared!")
|
| 590 |
+
# st.rerun()
|
| 591 |
+
|
| 592 |
+
# # Chat controls in sidebar
|
| 593 |
+
# st.sidebar.subheader("π¬ Chat Controls")
|
| 594 |
+
# if st.sidebar.button("π§Ή Clear Chat History"):
|
| 595 |
+
# st.session_state.qa_system.clear_chat_history()
|
| 596 |
+
# st.sidebar.success("Chat cleared!")
|
| 597 |
+
# st.rerun()
|
| 598 |
+
|
| 599 |
+
# # Response mode toggle
|
| 600 |
+
# use_chat_mode = st.sidebar.toggle("π¬ Conversational Mode", value=True,
|
| 601 |
+
# help="Enable for follow-up questions and context retention")
|
| 602 |
+
|
| 603 |
+
# # Main interface
|
| 604 |
+
# if not st.session_state.qa_system.is_ready:
|
| 605 |
+
# # Show paper loading interface when system not ready
|
| 606 |
+
# st.header("π₯ Load Academic Papers")
|
| 607 |
+
|
| 608 |
+
# col1, col2 = st.columns([1, 1])
|
| 609 |
+
|
| 610 |
+
# with col1:
|
| 611 |
+
# st.subheader("From arXiv")
|
| 612 |
+
# arxiv_id = st.text_input("Enter arXiv ID (e.g., 2301.00001)")
|
| 613 |
+
# if st.button("Download from arXiv"):
|
| 614 |
+
# if arxiv_id:
|
| 615 |
+
# with st.spinner("Downloading paper..."):
|
| 616 |
+
# filepath = st.session_state.qa_system.download_arxiv_paper(arxiv_id)
|
| 617 |
+
# if filepath:
|
| 618 |
+
# st.success(f"Downloaded paper")
|
| 619 |
+
# st.session_state.papers_loaded = False
|
| 620 |
+
# else:
|
| 621 |
+
# st.error("Failed to download paper")
|
| 622 |
+
|
| 623 |
+
# with col2:
|
| 624 |
+
# st.subheader("Upload PDF Files")
|
| 625 |
+
# uploaded_files = st.file_uploader(
|
| 626 |
+
# "Choose PDF files",
|
| 627 |
+
# type="pdf",
|
| 628 |
+
# accept_multiple_files=True
|
| 629 |
+
# )
|
| 630 |
+
|
| 631 |
+
# if uploaded_files:
|
| 632 |
+
# saved_files = []
|
| 633 |
+
# for uploaded_file in uploaded_files:
|
| 634 |
+
# file_path = st.session_state.qa_system.data_dir / uploaded_file.name
|
| 635 |
+
# with open(file_path, "wb") as f:
|
| 636 |
+
# f.write(uploaded_file.getbuffer())
|
| 637 |
+
# saved_files.append(str(file_path))
|
| 638 |
+
|
| 639 |
+
# st.success(f"Uploaded {len(saved_files)} files")
|
| 640 |
+
# st.session_state.papers_loaded = False
|
| 641 |
+
|
| 642 |
+
# # Process papers
|
| 643 |
+
# st.subheader("π Process Papers")
|
| 644 |
+
# current_papers = st.session_state.qa_system.get_loaded_papers_info()
|
| 645 |
+
|
| 646 |
+
# if not current_papers:
|
| 647 |
+
# st.info("No papers found. Please upload or download papers first.")
|
| 648 |
+
# else:
|
| 649 |
+
# st.info(f"Found {len(current_papers)} paper(s): {', '.join(current_papers)}")
|
| 650 |
+
|
| 651 |
+
# if st.button("π Process Papers", type="primary"):
|
| 652 |
+
# with st.spinner("Processing papers (creating embeddings on CPU)..."):
|
| 653 |
+
# result = st.session_state.qa_system.process_all_papers()
|
| 654 |
+
|
| 655 |
+
# if "error" in result:
|
| 656 |
+
# st.error(result["error"])
|
| 657 |
+
# st.session_state.papers_loaded = False
|
| 658 |
+
# else:
|
| 659 |
+
# st.success(result["success"])
|
| 660 |
+
# st.session_state.papers_loaded = True
|
| 661 |
+
# st.rerun()
|
| 662 |
+
|
| 663 |
+
# else:
|
| 664 |
+
# # Main chat interface when system is ready
|
| 665 |
+
# st.header("π¬ Chat with Your Papers")
|
| 666 |
+
|
| 667 |
+
# # Show loaded papers info
|
| 668 |
+
# loaded_papers = st.session_state.qa_system.get_loaded_papers_info()
|
| 669 |
+
# st.info(f"π Chatting with {len(loaded_papers)} paper(s): {', '.join(loaded_papers)}")
|
| 670 |
+
|
| 671 |
+
# # Chat history display
|
| 672 |
+
# chat_container = st.container()
|
| 673 |
+
|
| 674 |
+
# with chat_container:
|
| 675 |
+
# # Display chat history
|
| 676 |
+
# for i, message in enumerate(st.session_state.qa_system.chat_history[-10:]): # Show last 10 messages
|
| 677 |
+
# # User message
|
| 678 |
+
# st.markdown(f"""
|
| 679 |
+
# <div class="chat-message user-message">
|
| 680 |
+
# <div class="message-content"><strong style="color: black;">You:</strong> {message['question']}</div>
|
| 681 |
+
# <div class="message-timestamp">{message['timestamp']}</div>
|
| 682 |
+
# </div>
|
| 683 |
+
# """, unsafe_allow_html=True)
|
| 684 |
+
|
| 685 |
+
# # Bot response
|
| 686 |
+
# st.markdown(f"""
|
| 687 |
+
# <div class="chat-message bot-message">
|
| 688 |
+
# <div class="message-content"><strong style="color: black;">Assistant:</strong></div>
|
| 689 |
+
# </div>
|
| 690 |
+
# """, unsafe_allow_html=True)
|
| 691 |
+
|
| 692 |
+
# st.write(message['answer'])
|
| 693 |
+
# st.markdown("---")
|
| 694 |
+
|
| 695 |
+
# # Quick question buttons
|
| 696 |
+
# st.subheader("π Quick Questions")
|
| 697 |
+
# col1, col2, col3 = st.columns(3)
|
| 698 |
+
|
| 699 |
+
# quick_question = None
|
| 700 |
+
# with col1:
|
| 701 |
+
# if st.button("π― Main Research Question"):
|
| 702 |
+
# quick_question = "What is the main research question or objective addressed in this paper? Please provide a detailed explanation."
|
| 703 |
+
# if st.button("π¬ Methodology"):
|
| 704 |
+
# quick_question = "What methodology or research approach was used in this study? Please explain in detail including any experimental design, data collection methods, and analytical techniques."
|
| 705 |
+
|
| 706 |
+
# with col2:
|
| 707 |
+
# if st.button("π Key Findings"):
|
| 708 |
+
# quick_question = "What are the key findings and results of this research? Please provide a comprehensive summary of the main discoveries and their significance."
|
| 709 |
+
# if st.button("π― Conclusions"):
|
| 710 |
+
# quick_question = "What are the main conclusions and implications of this research? How do the authors interpret their findings?"
|
| 711 |
+
|
| 712 |
+
# with col3:
|
| 713 |
+
# if st.button("β οΈ Limitations"):
|
| 714 |
+
# quick_question = "What are the limitations of this study? What do the authors identify as potential weaknesses or areas for future research?"
|
| 715 |
+
# if st.button("π Summary"):
|
| 716 |
+
# quick_question = "Please provide a comprehensive summary of this paper including the research question, methodology, key findings, and conclusions."
|
| 717 |
+
|
| 718 |
+
# # Chat input
|
| 719 |
+
# st.subheader("π Ask Your Question")
|
| 720 |
+
# user_question = st.text_area("Type your question here...", height=100, placeholder="Ask anything about your papers...")
|
| 721 |
+
|
| 722 |
+
# # Use quick question if selected, otherwise use user input
|
| 723 |
+
# question_to_ask = quick_question if quick_question else user_question
|
| 724 |
+
|
| 725 |
+
# col1, col2 = st.columns([3, 1])
|
| 726 |
+
# with col1:
|
| 727 |
+
# if st.button("Send Message", type="primary", disabled=not question_to_ask):
|
| 728 |
+
# if question_to_ask:
|
| 729 |
+
# with st.spinner("Thinking... (Processing via Groq API)"):
|
| 730 |
+
# result = st.session_state.qa_system.ask_question(
|
| 731 |
+
# question_to_ask,
|
| 732 |
+
# use_chat_engine=use_chat_mode
|
| 733 |
+
# )
|
| 734 |
+
|
| 735 |
+
# if "error" in result:
|
| 736 |
+
# st.error(result["error"])
|
| 737 |
+
# else:
|
| 738 |
+
# st.rerun() # Reload to show new message
|
| 739 |
+
|
| 740 |
+
# with col2:
|
| 741 |
+
# response_mode = "π¬ Chat Mode" if use_chat_mode else "β Q&A Mode"
|
| 742 |
+
# st.info(response_mode)
|
| 743 |
+
|
| 744 |
+
# # Sources section (show for last question if available)
|
| 745 |
+
# if (st.session_state.qa_system.chat_history and
|
| 746 |
+
# st.session_state.qa_system.chat_history[-1].get('sources')):
|
| 747 |
+
|
| 748 |
+
# with st.expander("π View Sources", expanded=False):
|
| 749 |
+
# sources = st.session_state.qa_system.chat_history[-1]['sources']
|
| 750 |
+
# for i, source in enumerate(sources, 1):
|
| 751 |
+
# st.markdown(f"**Source {i}** (Relevance: {source['score']})")
|
| 752 |
+
# st.text(source['text'])
|
| 753 |
+
# st.markdown("---")
|
| 754 |
+
|
| 755 |
+
# if __name__ == "__main__":
|
| 756 |
+
# create_streamlit_app()
|
| 757 |
+
|
| 758 |
+
|
| 759 |
+
|
| 760 |
+
|
| 761 |
+
import os
|
| 762 |
+
import requests
|
| 763 |
+
import arxiv
|
| 764 |
+
from pathlib import Path
|
| 765 |
+
from typing import List, Dict, Optional
|
| 766 |
import streamlit as st
|
| 767 |
+
from llama_index.core import (
|
| 768 |
+
VectorStoreIndex,
|
| 769 |
+
SimpleDirectoryReader,
|
| 770 |
+
Settings,
|
| 771 |
+
Document,
|
| 772 |
+
StorageContext,
|
| 773 |
+
load_index_from_storage
|
| 774 |
+
)
|
| 775 |
+
from llama_index.core.node_parser import SentenceSplitter
|
| 776 |
+
from llama_index.llms.groq import Groq
|
| 777 |
+
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 778 |
+
from llama_index.core.query_engine import RetrieverQueryEngine
|
| 779 |
+
from llama_index.core.retrievers import VectorIndexRetriever
|
| 780 |
+
from llama_index.core.response_synthesizers import get_response_synthesizer
|
| 781 |
+
from llama_index.core.memory import ChatMemoryBuffer
|
| 782 |
+
from llama_index.core.chat_engine import CondensePlusContextChatEngine
|
| 783 |
+
import logging
|
| 784 |
+
import hashlib
|
| 785 |
+
import json
|
| 786 |
+
import time
|
| 787 |
+
from datetime import datetime
|
| 788 |
+
|
| 789 |
+
# Configure logging
|
| 790 |
+
logging.basicConfig(level=logging.INFO)
|
| 791 |
+
logger = logging.getLogger(__name__)
|
| 792 |
+
|
| 793 |
+
class AcademicPaperQA:
|
| 794 |
+
def __init__(self, model_name="llama3-70b-8192", groq_api_key=None):
|
| 795 |
+
"""Initialize the Academic Paper Q&A system with Groq API"""
|
| 796 |
+
self.data_dir = Path("./papers")
|
| 797 |
+
self.storage_dir = Path("./storage")
|
| 798 |
+
self.model_name = model_name
|
| 799 |
+
self.groq_api_key = groq_api_key
|
| 800 |
+
|
| 801 |
+
# Create directories
|
| 802 |
+
self.data_dir.mkdir(exist_ok=True)
|
| 803 |
+
self.storage_dir.mkdir(exist_ok=True)
|
| 804 |
+
|
| 805 |
+
# Initialize models
|
| 806 |
+
self._setup_models()
|
| 807 |
+
|
| 808 |
+
# Initialize index and chat engine
|
| 809 |
+
self.index = None
|
| 810 |
+
self.query_engine = None
|
| 811 |
+
self.chat_engine = None
|
| 812 |
+
self.current_papers_hash = None
|
| 813 |
+
self.is_ready = False
|
| 814 |
+
|
| 815 |
+
# Chat history
|
| 816 |
+
self.chat_history = []
|
| 817 |
+
|
| 818 |
+
def _setup_models(self):
|
| 819 |
+
"""Setup LLM and embedding models with Groq API"""
|
| 820 |
+
try:
|
| 821 |
+
if not self.groq_api_key:
|
| 822 |
+
raise ValueError("Groq API key is required. Please set GROQ_API_KEY environment variable or pass it directly.")
|
| 823 |
+
|
| 824 |
+
# Initialize LLM via Groq API with conservative token settings
|
| 825 |
+
self.llm = Groq(
|
| 826 |
+
model=self.model_name,
|
| 827 |
+
api_key=self.groq_api_key,
|
| 828 |
+
temperature=0.3,
|
| 829 |
+
max_tokens=2048, # Reduced max tokens to prevent context overflow
|
| 830 |
+
top_p=0.9,
|
| 831 |
+
system_prompt="""You are an expert academic research assistant. Provide comprehensive, detailed responses about research papers including:
|
| 832 |
+
|
| 833 |
+
1. Direct answers to questions
|
| 834 |
+
2. Relevant background context
|
| 835 |
+
3. Specific details from papers including methodologies and findings
|
| 836 |
+
4. Analysis and interpretation
|
| 837 |
+
5. Connections between concepts when relevant
|
| 838 |
+
|
| 839 |
+
Keep responses thorough but concise to stay within token limits."""
|
| 840 |
+
)
|
| 841 |
+
|
| 842 |
+
# Initialize lightweight embedding model for CPU usage
|
| 843 |
+
# Using a more stable embedding model
|
| 844 |
+
try:
|
| 845 |
+
self.embed_model = HuggingFaceEmbedding(
|
| 846 |
+
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
| 847 |
+
device="cpu",
|
| 848 |
+
max_length=512 # Explicit max length to prevent issues
|
| 849 |
+
)
|
| 850 |
+
except Exception as e:
|
| 851 |
+
logger.warning(f"Failed to load HuggingFace embedding, trying alternative: {e}")
|
| 852 |
+
# Fallback to a different embedding model
|
| 853 |
+
self.embed_model = HuggingFaceEmbedding(
|
| 854 |
+
model_name="BAAI/bge-small-en-v1.5",
|
| 855 |
+
device="cpu",
|
| 856 |
+
max_length=512
|
| 857 |
+
)
|
| 858 |
+
|
| 859 |
+
# Configure global settings with conservative values
|
| 860 |
+
Settings.llm = self.llm
|
| 861 |
+
Settings.embed_model = self.embed_model
|
| 862 |
+
Settings.chunk_size = 256 # Smaller chunks to prevent context overflow
|
| 863 |
+
Settings.chunk_overlap = 25 # Reduced overlap
|
| 864 |
+
|
| 865 |
+
logger.info(f"Models initialized successfully with {self.model_name} via Groq API")
|
| 866 |
+
|
| 867 |
+
except Exception as e:
|
| 868 |
+
logger.error(f"Error setting up models: {e}")
|
| 869 |
+
raise
|
| 870 |
+
|
| 871 |
+
def _get_papers_hash(self) -> str:
|
| 872 |
+
"""Generate hash of current papers in directory"""
|
| 873 |
+
pdf_files = list(self.data_dir.glob("*.pdf"))
|
| 874 |
+
if not pdf_files:
|
| 875 |
+
return ""
|
| 876 |
+
|
| 877 |
+
# Create hash based on filenames and file sizes
|
| 878 |
+
file_info = []
|
| 879 |
+
for pdf_file in sorted(pdf_files):
|
| 880 |
+
file_info.append(f"{pdf_file.name}:{pdf_file.stat().st_size}")
|
| 881 |
+
|
| 882 |
+
papers_string = "|".join(file_info)
|
| 883 |
+
return hashlib.md5(papers_string.encode()).hexdigest()
|
| 884 |
+
|
| 885 |
+
def _save_papers_metadata(self, papers_hash: str):
|
| 886 |
+
"""Save metadata about current papers"""
|
| 887 |
+
metadata_file = self.storage_dir / "papers_metadata.json"
|
| 888 |
+
metadata = {
|
| 889 |
+
"papers_hash": papers_hash,
|
| 890 |
+
"model_name": self.model_name
|
| 891 |
+
}
|
| 892 |
+
with open(metadata_file, "w") as f:
|
| 893 |
+
json.dump(metadata, f)
|
| 894 |
+
|
| 895 |
+
def _load_papers_metadata(self) -> Dict:
|
| 896 |
+
"""Load metadata about papers"""
|
| 897 |
+
metadata_file = self.storage_dir / "papers_metadata.json"
|
| 898 |
+
if metadata_file.exists():
|
| 899 |
+
with open(metadata_file, "r") as f:
|
| 900 |
+
return json.load(f)
|
| 901 |
+
return {}
|
| 902 |
+
|
| 903 |
+
def download_arxiv_paper(self, arxiv_id: str) -> Optional[str]:
|
| 904 |
+
"""Download paper from arXiv"""
|
| 905 |
+
try:
|
| 906 |
+
search = arxiv.Search(id_list=[arxiv_id])
|
| 907 |
+
paper = next(search.results())
|
| 908 |
+
|
| 909 |
+
filename = f"{arxiv_id.replace('/', '_')}.pdf"
|
| 910 |
+
filepath = self.data_dir / filename
|
| 911 |
+
|
| 912 |
+
paper.download_pdf(dirpath=str(self.data_dir), filename=filename)
|
| 913 |
+
|
| 914 |
+
logger.info(f"Downloaded paper: {paper.title}")
|
| 915 |
+
return str(filepath)
|
| 916 |
+
|
| 917 |
+
except Exception as e:
|
| 918 |
+
logger.error(f"Error downloading paper {arxiv_id}: {e}")
|
| 919 |
+
return None
|
| 920 |
+
|
| 921 |
+
def load_documents(self, file_paths: List[str] = None) -> List[Document]:
|
| 922 |
+
"""Load documents from PDF files with error handling"""
|
| 923 |
+
try:
|
| 924 |
+
if file_paths is None:
|
| 925 |
+
reader = SimpleDirectoryReader(
|
| 926 |
+
input_dir=str(self.data_dir),
|
| 927 |
+
required_exts=[".pdf"],
|
| 928 |
+
recursive=False # Explicit setting
|
| 929 |
+
)
|
| 930 |
+
else:
|
| 931 |
+
reader = SimpleDirectoryReader(input_files=file_paths)
|
| 932 |
+
|
| 933 |
+
documents = reader.load_data()
|
| 934 |
+
logger.info(f"Loaded {len(documents)} documents")
|
| 935 |
+
|
| 936 |
+
# Clean and validate documents
|
| 937 |
+
cleaned_documents = []
|
| 938 |
+
for doc in documents:
|
| 939 |
+
if doc.text and len(doc.text.strip()) > 50: # Filter out very short documents
|
| 940 |
+
# Truncate very long documents to prevent memory issues
|
| 941 |
+
if len(doc.text) > 50000:
|
| 942 |
+
doc.text = doc.text[:50000] + "... [Document truncated]"
|
| 943 |
+
cleaned_documents.append(doc)
|
| 944 |
+
|
| 945 |
+
logger.info(f"After cleaning: {len(cleaned_documents)} valid documents")
|
| 946 |
+
return cleaned_documents
|
| 947 |
+
|
| 948 |
+
except Exception as e:
|
| 949 |
+
logger.error(f"Error loading documents: {e}")
|
| 950 |
+
return []
|
| 951 |
+
|
| 952 |
+
def create_index(self, documents: List[Document], save_index: bool = True):
|
| 953 |
+
"""Create vector index from documents with CPU-optimized settings"""
|
| 954 |
+
try:
|
| 955 |
+
if not documents:
|
| 956 |
+
raise ValueError("No documents provided for indexing")
|
| 957 |
+
|
| 958 |
+
logger.info(f"Creating index from {len(documents)} documents")
|
| 959 |
+
|
| 960 |
+
# CPU-optimized sentence splitter with smaller chunks
|
| 961 |
+
sentence_splitter = SentenceSplitter(
|
| 962 |
+
chunk_size=256, # Smaller chunks to prevent context overflow
|
| 963 |
+
chunk_overlap=25,
|
| 964 |
+
separator=" " # Explicit separator
|
| 965 |
+
)
|
| 966 |
+
|
| 967 |
+
# Process documents in smaller batches to prevent memory issues
|
| 968 |
+
batch_size = 5
|
| 969 |
+
all_nodes = []
|
| 970 |
+
|
| 971 |
+
for i in range(0, len(documents), batch_size):
|
| 972 |
+
batch = documents[i:i + batch_size]
|
| 973 |
+
logger.info(f"Processing batch {i//batch_size + 1}/{(len(documents) + batch_size - 1)//batch_size}")
|
| 974 |
+
|
| 975 |
+
nodes = sentence_splitter.get_nodes_from_documents(batch)
|
| 976 |
+
all_nodes.extend(nodes)
|
| 977 |
+
|
| 978 |
+
# Create index from nodes
|
| 979 |
+
self.index = VectorStoreIndex(
|
| 980 |
+
nodes=all_nodes,
|
| 981 |
+
show_progress=True
|
| 982 |
+
)
|
| 983 |
+
|
| 984 |
+
if save_index:
|
| 985 |
+
self.index.storage_context.persist(persist_dir=str(self.storage_dir))
|
| 986 |
+
current_hash = self._get_papers_hash()
|
| 987 |
+
self._save_papers_metadata(current_hash)
|
| 988 |
+
self.current_papers_hash = current_hash
|
| 989 |
+
logger.info("Index saved to storage")
|
| 990 |
+
|
| 991 |
+
self._create_query_engine()
|
| 992 |
+
self._create_chat_engine()
|
| 993 |
+
self.is_ready = True
|
| 994 |
+
logger.info("Vector index created successfully")
|
| 995 |
+
|
| 996 |
+
except Exception as e:
|
| 997 |
+
logger.error(f"Error creating index: {e}")
|
| 998 |
+
self.is_ready = False
|
| 999 |
+
raise
|
| 1000 |
+
|
| 1001 |
+
def should_rebuild_index(self) -> bool:
|
| 1002 |
+
"""Check if index should be rebuilt based on papers"""
|
| 1003 |
+
current_hash = self._get_papers_hash()
|
| 1004 |
+
|
| 1005 |
+
if not current_hash:
|
| 1006 |
+
return False
|
| 1007 |
+
|
| 1008 |
+
metadata = self._load_papers_metadata()
|
| 1009 |
+
|
| 1010 |
+
if not metadata:
|
| 1011 |
+
logger.info("No metadata found, rebuilding index")
|
| 1012 |
+
return True
|
| 1013 |
+
|
| 1014 |
+
if metadata.get("papers_hash") != current_hash:
|
| 1015 |
+
logger.info("Papers hash changed, rebuilding index")
|
| 1016 |
+
return True
|
| 1017 |
+
|
| 1018 |
+
if metadata.get("model_name") != self.model_name:
|
| 1019 |
+
logger.info("Model changed, rebuilding index")
|
| 1020 |
+
return True
|
| 1021 |
+
|
| 1022 |
+
return False
|
| 1023 |
+
|
| 1024 |
+
def load_index(self) -> bool:
|
| 1025 |
+
"""Load existing index from storage if it matches current papers"""
|
| 1026 |
+
try:
|
| 1027 |
+
if self.should_rebuild_index():
|
| 1028 |
+
logger.info("Index needs to be rebuilt due to changes")
|
| 1029 |
+
return False
|
| 1030 |
+
|
| 1031 |
+
index_files = list(self.storage_dir.glob("*"))
|
| 1032 |
+
if not index_files:
|
| 1033 |
+
logger.info("No index files found")
|
| 1034 |
+
return False
|
| 1035 |
+
|
| 1036 |
+
storage_context = StorageContext.from_defaults(
|
| 1037 |
+
persist_dir=str(self.storage_dir)
|
| 1038 |
+
)
|
| 1039 |
+
self.index = load_index_from_storage(storage_context)
|
| 1040 |
+
self._create_query_engine()
|
| 1041 |
+
self._create_chat_engine()
|
| 1042 |
+
self.current_papers_hash = self._get_papers_hash()
|
| 1043 |
+
self.is_ready = True
|
| 1044 |
+
|
| 1045 |
+
logger.info("Index loaded from storage successfully")
|
| 1046 |
+
return True
|
| 1047 |
+
|
| 1048 |
+
except Exception as e:
|
| 1049 |
+
logger.error(f"Error loading index: {e}")
|
| 1050 |
+
self.is_ready = False
|
| 1051 |
+
return False
|
| 1052 |
+
|
| 1053 |
+
def _create_query_engine(self):
|
| 1054 |
+
"""Create query engine with settings for detailed responses"""
|
| 1055 |
+
try:
|
| 1056 |
+
if not self.index:
|
| 1057 |
+
raise ValueError("No index available for query engine")
|
| 1058 |
+
|
| 1059 |
+
retriever = VectorIndexRetriever(
|
| 1060 |
+
index=self.index,
|
| 1061 |
+
similarity_top_k=2 # Reduced to prevent context overflow
|
| 1062 |
+
)
|
| 1063 |
+
|
| 1064 |
+
response_synthesizer = get_response_synthesizer(
|
| 1065 |
+
response_mode="compact", # More efficient for context management
|
| 1066 |
+
streaming=False,
|
| 1067 |
+
text_qa_template="""Context information is below.
|
| 1068 |
+
---------------------
|
| 1069 |
+
{context_str}
|
| 1070 |
+
---------------------
|
| 1071 |
+
Based on the context information, provide a comprehensive answer to the question. Include specific details from the research papers and explain key concepts clearly.
|
| 1072 |
+
|
| 1073 |
+
Question: {query_str}
|
| 1074 |
+
Answer: """
|
| 1075 |
+
)
|
| 1076 |
+
|
| 1077 |
+
self.query_engine = RetrieverQueryEngine(
|
| 1078 |
+
retriever=retriever,
|
| 1079 |
+
response_synthesizer=response_synthesizer
|
| 1080 |
+
)
|
| 1081 |
+
|
| 1082 |
+
logger.info("Query engine created successfully")
|
| 1083 |
+
|
| 1084 |
+
except Exception as e:
|
| 1085 |
+
logger.error(f"Error creating query engine: {e}")
|
| 1086 |
+
raise
|
| 1087 |
+
|
| 1088 |
+
    def _create_chat_engine(self):
        """Create chat engine for conversational interactions with conservative settings"""
        try:
            if not self.index:
                raise ValueError("No index available for chat engine")

            # Create memory buffer with a small token limit to prevent overflow
            memory = ChatMemoryBuffer.from_defaults(token_limit=1000)

            # Create chat engine with conservative settings. The context window
            # and max response tokens are properties of the LLM, not of the
            # chat engine, so they are configured on the Groq LLM rather than
            # passed to from_defaults() here.
            self.chat_engine = CondensePlusContextChatEngine.from_defaults(
                retriever=VectorIndexRetriever(
                    index=self.index,
                    similarity_top_k=2  # Reduced to manage context size
                ),
                memory=memory,
                llm=self.llm,
                context_prompt=(
                    "You are an expert academic research assistant. "
                    "Use the following context to answer questions thoroughly but concisely. "
                    "Context:\n{context_str}\n"
                    "Answer the user's question based on the provided context."
                ),
                verbose=True
            )

            logger.info("Chat engine created successfully")

        except Exception as e:
            logger.error(f"Error creating chat engine: {e}")
            raise

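    # CondensePlusContext first condenses the chat history plus the new message
    # into a standalone question, then retrieves context for that question and
    # answers with the retrieved chunks injected via context_prompt above.
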
    def get_loaded_papers_info(self) -> List[str]:
        """Get list of currently loaded papers"""
        pdf_files = list(self.data_dir.glob("*.pdf"))
        return [pdf_file.name for pdf_file in pdf_files]

    def clear_papers(self):
        """Clear all papers and reset index"""
        try:
            # Remove all PDF files
            for pdf_file in self.data_dir.glob("*.pdf"):
                pdf_file.unlink()

            # Clear storage
            if self.storage_dir.exists():
                import shutil
                shutil.rmtree(self.storage_dir)
                self.storage_dir.mkdir(exist_ok=True)

            # Reset everything
            self.index = None
            self.query_engine = None
            self.chat_engine = None
            self.current_papers_hash = None
            self.is_ready = False
            self.chat_history = []

            logger.info("Papers and index cleared")
            return True

        except Exception as e:
            logger.error(f"Error clearing papers: {e}")
            return False

    def clear_chat_history(self):
        """Clear chat history and reset memory"""
        try:
            self.chat_history = []
            if self.chat_engine and hasattr(self.chat_engine, 'memory'):
                self.chat_engine.memory.reset()
            logger.info("Chat history cleared")
        except Exception as e:
            logger.error(f"Error clearing chat history: {e}")

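    # chat_history above is the UI-facing log; memory.reset() separately clears
    # the ChatMemoryBuffer the engine uses to condense follow-up questions, so
    # both must be cleared for a genuinely fresh conversation.
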
    def process_all_papers(self) -> Dict[str, str]:
        """Process all papers in the directory and create/load index"""
        try:
            current_papers = self.get_loaded_papers_info()
            if not current_papers:
                return {"error": "No papers found in directory"}

            logger.info(f"Processing {len(current_papers)} papers: {current_papers}")

            if self.load_index():
                return {"success": f"Loaded existing index for {len(current_papers)} papers"}

            logger.info("Creating new index from documents...")
            documents = self.load_documents()

            if not documents:
                return {"error": "Failed to load documents from PDF files"}

            self.create_index(documents)

            if self.is_ready:
                return {"success": f"Successfully created index for {len(current_papers)} papers"}
            else:
                return {"error": "Failed to create index"}

        except Exception as e:
            logger.error(f"Error processing papers: {e}")
            return {"error": f"Error processing papers: {str(e)}"}

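    # Flow: the cheap path (load_index) is tried first and only falls through
    # to the expensive load_documents()/create_index() path (defined earlier in
    # this file) when no valid persisted index exists for the current papers.
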
    # Dict[str, Any] assumes `Any` is imported from typing alongside Dict/List
    def ask_question(self, question: str, use_chat_engine: bool = True) -> Dict[str, Any]:
        """Ask a question using either the chat engine (conversational) or the query engine (standalone)"""
        if not self.is_ready:
            return {"error": "System not ready. Please process papers first."}

        try:
            logger.info(f"Asking question: {question}")

            # Truncate very long questions to prevent context overflow
            if len(question) > 500:
                question = question[:500] + "..."
                logger.warning("Question truncated to prevent context overflow")

            if use_chat_engine and self.chat_engine:
                # Use chat engine for conversational context
                try:
                    response = self.chat_engine.chat(question)
                    answer = str(response)
                except Exception as chat_error:
                    logger.warning(f"Chat engine failed, falling back to query engine: {chat_error}")
                    # Fall back to the query engine if the chat engine fails
                    response = self.query_engine.query(question)
                    answer = str(response)
                    use_chat_engine = False  # Update flag for history tracking
            else:
                # Use query engine for standalone questions
                response = self.query_engine.query(question)
                answer = str(response)

            # Collect sources first so they can be stored with the history
            # entry (the "View Sources" expander reads them from there)
            sources = []
            if hasattr(response, 'source_nodes') and response.source_nodes:
                for node in response.source_nodes:
                    sources.append({
                        'text': node.text[:300] + "..." if len(node.text) > 300 else node.text,
                        'score': node.score if hasattr(node, 'score') else 'N/A'
                    })

            # Add to chat history
            self.chat_history.append({
                "timestamp": datetime.now().strftime("%H:%M:%S"),
                "question": question,
                "answer": answer,
                "sources": sources,
                "type": "chat" if use_chat_engine else "query"
            })

            logger.info(f"Generated answer length: {len(answer)} characters")

            return {
                "answer": answer,
                "sources": sources,
                "timestamp": datetime.now().strftime("%H:%M:%S")
            }

        except Exception as e:
            logger.error(f"Error answering question: {e}")
            return {"error": f"Error processing question: {str(e)}"}

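# Minimal usage sketch outside Streamlit (assumes a valid Groq API key):
#
#   qa = AcademicPaperQA(model_name="llama3-8b-8192", groq_api_key="gsk_...")
#   qa.process_all_papers()
#   print(qa.ask_question("What are the key findings?")["answer"])
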
def create_streamlit_app():
    """Create Streamlit web interface with chat functionality"""
    st.set_page_config(
        page_title="Academic Paper Q&A Bot (Groq Powered)",
        page_icon="🔬",
        layout="wide"
    )

    st.title("🔬 Academic Paper Q&A Bot (Groq Powered)")

    # Custom CSS for chat interface
    st.markdown("""
    <style>
    .chat-message {
        padding: 1rem;
        border-radius: 0.5rem;
        margin-bottom: 1rem;
        display: flex;
        flex-direction: column;
    }
    .user-message {
        background-color: #e3f2fd;
        margin-left: 20%;
    }
    .bot-message {
        background-color: #f5f5f5;
        margin-right: 20%;
    }
    .message-content {
        margin: 0.5rem 0;
    }
    .message-timestamp {
        font-size: 0.8rem;
        color: #666;
        align-self: flex-end;
    }
    .stChatInputContainer {
        position: fixed;
        bottom: 0;
        background: white;
        padding: 1rem;
        border-top: 1px solid #e0e0e0;
    }
    </style>
    """, unsafe_allow_html=True)

    # API key configuration in sidebar
    st.sidebar.header("🔑 API Configuration")
    groq_api_key = st.sidebar.text_input(
        "Groq API Key:",
        type="password",
        help="Get your free API key from https://console.groq.com/keys"
    )

    if not groq_api_key:
        groq_api_key = os.getenv("GROQ_API_KEY")

    if not groq_api_key:
        st.sidebar.error("Please enter your Groq API key or set the GROQ_API_KEY environment variable")
        st.info("📝 **To get started:**\n1. Go to https://console.groq.com/keys\n2. Create a free account\n3. Generate an API key\n4. Enter it in the sidebar")
        st.stop()

    # Model selection in sidebar
    st.sidebar.header("⚙️ Configuration")
    model_options = {
        "Llama3 8B (Fast & Stable)": "llama3-8b-8192",
        "Llama3 70B (Most Capable)": "llama3-70b-8192",
        "Mixtral 8x7B (Balanced)": "mixtral-8x7b-32768",
        "Gemma 7B (Efficient)": "gemma-7b-it"
    }

    selected_model = st.sidebar.selectbox(
        "Choose Groq Model:",
        list(model_options.keys()),
        index=0  # Default to the more stable 8B model
    )

    model_name = model_options[selected_model]

    # Initialize session state
    if ('qa_system' not in st.session_state or
            st.session_state.get('current_model') != model_name or
            st.session_state.get('current_api_key') != groq_api_key):

        with st.spinner(f"Initializing system with {selected_model}..."):
            try:
                st.session_state.qa_system = AcademicPaperQA(
                    model_name=model_name,
                    groq_api_key=groq_api_key
                )
                st.session_state.current_model = model_name
                st.session_state.current_api_key = groq_api_key
                st.session_state.papers_loaded = False
                st.success(f"System initialized with {selected_model} via Groq API!")
            except Exception as e:
                st.error(f"Error initializing system: {e}")
                st.info("Please check your Groq API key and try again.")
                st.stop()

    if 'papers_loaded' not in st.session_state:
        st.session_state.papers_loaded = False

    # Display current model info
    st.sidebar.info(f"**Current model:** {selected_model}")
    st.sidebar.success("✅ Using Groq API (Cloud)")
    st.sidebar.info("💬 Conversational Mode: ON")

    # Show system status
    if hasattr(st.session_state.qa_system, 'is_ready'):
        if st.session_state.qa_system.is_ready:
            st.sidebar.success("✅ System Ready")
        else:
            st.sidebar.warning("⚠️ Process papers first")

    # Show currently loaded papers
    current_papers = st.session_state.qa_system.get_loaded_papers_info()
    if current_papers:
        st.sidebar.subheader("📚 Loaded Papers:")
        for paper in current_papers:
            st.sidebar.text(f"📄 {paper}")

        if st.sidebar.button("🗑️ Clear All Papers"):
            with st.spinner("Clearing papers..."):
                if st.session_state.qa_system.clear_papers():
                    st.session_state.papers_loaded = False
                    st.sidebar.success("Papers cleared!")
                    st.rerun()

    # Chat controls in sidebar
    st.sidebar.subheader("💬 Chat Controls")
    if st.sidebar.button("🧹 Clear Chat History"):
        st.session_state.qa_system.clear_chat_history()
        st.sidebar.success("Chat cleared!")
        st.rerun()

    # Main interface
    if not st.session_state.qa_system.is_ready:
        # Show paper loading interface when system is not ready
        st.header("📥 Load Academic Papers")

        col1, col2 = st.columns([1, 1])

        with col1:
            st.subheader("From arXiv")
            arxiv_id = st.text_input("Enter arXiv ID (e.g., 2301.00001)")
            if st.button("Download from arXiv"):
                if arxiv_id:
                    with st.spinner("Downloading paper..."):
                        filepath = st.session_state.qa_system.download_arxiv_paper(arxiv_id)
                        if filepath:
                            st.success("Downloaded paper")
                            st.session_state.papers_loaded = False
                        else:
                            st.error("Failed to download paper")

        with col2:
            st.subheader("Upload PDF Files")
            uploaded_files = st.file_uploader(
                "Choose PDF files",
                type="pdf",
                accept_multiple_files=True
            )

            if uploaded_files:
                saved_files = []
                for uploaded_file in uploaded_files:
                    file_path = st.session_state.qa_system.data_dir / uploaded_file.name
                    with open(file_path, "wb") as f:
                        f.write(uploaded_file.getbuffer())
                    saved_files.append(str(file_path))

                st.success(f"Uploaded {len(saved_files)} files")
                st.session_state.papers_loaded = False

        # Process papers
        st.subheader("🔄 Process Papers")
        current_papers = st.session_state.qa_system.get_loaded_papers_info()

        if not current_papers:
            st.info("No papers found. Please upload or download papers first.")
        else:
            st.info(f"Found {len(current_papers)} paper(s): {', '.join(current_papers)}")

            if st.button("🔄 Process Papers", type="primary"):
                with st.spinner("Processing papers (creating embeddings on CPU)..."):
                    result = st.session_state.qa_system.process_all_papers()

                    if "error" in result:
                        st.error(result["error"])
                        st.session_state.papers_loaded = False
                    else:
                        st.success(result["success"])
                        st.session_state.papers_loaded = True
                        st.rerun()

    else:
        # Main chat interface when system is ready
        st.header("💬 Chat with Your Papers")

        # Show loaded papers info
        loaded_papers = st.session_state.qa_system.get_loaded_papers_info()
        st.info(f"📚 Chatting with {len(loaded_papers)} paper(s): {', '.join(loaded_papers)}")

        # Chat history display
        chat_container = st.container()

        with chat_container:
            # Display the last 10 messages of the chat history
            for message in st.session_state.qa_system.chat_history[-10:]:
                st.markdown(f"""
                <div class="chat-message user-message">
                    <div class="message-content" style="color: black;">
                        <strong>You:</strong> {message['question']}
                    </div>
                    <div class="message-timestamp">{message['timestamp']}</div>
                </div>
                """, unsafe_allow_html=True)

                # Bot response
                st.markdown("""
                <div class="chat-message bot-message">
                    <div class="message-content"><strong style="color: black;">Assistant:</strong></div>
                </div>
                """, unsafe_allow_html=True)

                st.write(message['answer'])
                st.markdown("---")

        # Quick question buttons (handled by the send logic below)
        st.subheader("🚀 Quick Questions")
        col1, col2, col3 = st.columns(3)

        quick_question = None
        with col1:
            if st.button("🎯 Main Research Question"):
                quick_question = "What is the main research question addressed in this paper?"
            if st.button("🔬 Methodology"):
                quick_question = "What methodology was used in this study?"

        with col2:
            if st.button("📊 Key Findings"):
                quick_question = "What are the key findings of this research?"
            if st.button("🎯 Conclusions"):
                quick_question = "What are the main conclusions of this research?"

        with col3:
            if st.button("⚠️ Limitations"):
                quick_question = "What are the limitations of this study?"
            if st.button("📝 Summary"):
                quick_question = "Please provide a summary of this paper."

        # Chat input
        st.subheader("💭 Ask Your Question")
        user_question = st.text_area("Type your question here...", height=100, placeholder="Ask anything about your papers...")

        # A quick-question button triggers its own rerun, so it is asked
        # immediately; a typed question is sent when the button is clicked
        send_clicked = st.button("Send Message", type="primary", disabled=not user_question)
        question_to_ask = quick_question or (user_question if send_clicked else None)

        if question_to_ask:
            with st.spinner("Thinking... (Processing via Groq API)"):
                result = st.session_state.qa_system.ask_question(
                    question_to_ask,
                    use_chat_engine=True  # Always use conversational mode
                )

                if "error" in result:
                    st.error(result["error"])
                else:
                    st.rerun()  # Reload to show the new message

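        # st.rerun() works here because ask_question() has already appended the
        # exchange to chat_history, so the redrawn history above includes it.
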
        # Sources section (shown for the last question if available)
        if (st.session_state.qa_system.chat_history and
                st.session_state.qa_system.chat_history[-1].get('sources')):

            with st.expander("📚 View Sources", expanded=False):
                sources = st.session_state.qa_system.chat_history[-1]['sources']
                for i, source in enumerate(sources, 1):
                    st.markdown(f"**Source {i}** (Relevance: {source['score']})")
                    st.text(source['text'])
                    st.markdown("---")

if __name__ == "__main__":
    create_streamlit_app()