Update src/streamlit_app.py
Browse files- src/streamlit_app.py +61 -149
src/streamlit_app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# ==========================================================
|
| 2 |
-
# streamlit_app.py β
|
| 3 |
# ==========================================================
|
| 4 |
import os
|
| 5 |
import re
|
|
@@ -29,50 +29,6 @@ os.environ.update({
|
|
| 29 |
"HF_MODULES_CACHE": CACHE_DIR,
|
| 30 |
})
|
| 31 |
|
| 32 |
-
# ==========================================================
|
| 33 |
-
# 🧠 SMART SUGGESTION GENERATOR
|
| 34 |
-
# ==========================================================
|
| 35 |
-
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
|
| 36 |
-
if not toc or not chunks:
|
| 37 |
-
return ["How do I start using this guide?", "What does this document cover?"]
|
| 38 |
-
|
| 39 |
-
titles = []
|
| 40 |
-
for sec, raw_title in toc:
|
| 41 |
-
title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
|
| 42 |
-
title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
|
| 43 |
-
if 4 < len(title) < 120:
|
| 44 |
-
titles.append(title)
|
| 45 |
-
|
| 46 |
-
context_sample = " ".join(chunks[:3])[:4000]
|
| 47 |
-
prompt = f"""
|
| 48 |
-
You are a content assistant. Based on the TOC and text, generate 5–7 short natural questions.
|
| 49 |
-
Each question <18 words, ends with '?', sounds human. Document: "{doc_name}"
|
| 50 |
-
TOC:
|
| 51 |
-
{chr(10).join(['- ' + t for t in titles[:8]])}
|
| 52 |
-
Sample:
|
| 53 |
-
{context_sample}
|
| 54 |
-
"""
|
| 55 |
-
|
| 56 |
-
try:
|
| 57 |
-
ai_response = genai_generate(prompt)
|
| 58 |
-
lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
|
| 59 |
-
out = []
|
| 60 |
-
for ln in lines:
|
| 61 |
-
q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
|
| 62 |
-
if not q.endswith("?") and re.match(r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q):
|
| 63 |
-
q += "?"
|
| 64 |
-
if 8 <= len(q) <= 140:
|
| 65 |
-
out.append(q)
|
| 66 |
-
uniq = []
|
| 67 |
-
seen = set()
|
| 68 |
-
for q in out:
|
| 69 |
-
if q.lower() not in seen:
|
| 70 |
-
seen.add(q.lower())
|
| 71 |
-
uniq.append(q)
|
| 72 |
-
return uniq[:7] or [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
|
| 73 |
-
except Exception:
|
| 74 |
-
return ["How do I start using this guide?", "What does this document cover?"]
|
| 75 |
-
|
| 76 |
# ==========================================================
|
| 77 |
# 🎨 STYLING
|
| 78 |
# ==========================================================
|
|
@@ -80,17 +36,12 @@ st.markdown("""
|
|
| 80 |
<style>
|
| 81 |
div.block-container {padding-top:1.2rem;max-width:1080px;}
|
| 82 |
h1,h2,h3{color:#f3f4f6;font-weight:600;}
|
| 83 |
-
.suggest-chip{background:#0f1724;border:1px solid #374151;border-radius:14px;
|
| 84 |
-
color:#e6eef8;padding:8px 12px;cursor:pointer;font-size:13px;margin:6px 6px 6px 0;
|
| 85 |
-
display:inline-block;transition:background 0.2s,transform 0.1s;}
|
| 86 |
-
.suggest-chip:hover{background:#1e3a8a;transform:translateY(-2px);}
|
| 87 |
.answer-box{background:linear-gradient(180deg,#0b1220,#071027);
|
| 88 |
border-left:4px solid #3b82f6;border-radius:8px;padding:16px 18px;color:#e6eef8;
|
| 89 |
margin-top:12px;box-shadow:0 4px 14px rgba(0,0,0,0.35);}
|
| 90 |
.stTextInput>div>div>input{background-color:#0f172a!important;color:#f1f5f9!important;
|
| 91 |
border-radius:6px!important;border:1px solid #334155!important;padding:8px 10px!important;
|
| 92 |
font-size:15px!important;}
|
| 93 |
-
.small-link{font-size:13px;color:#60a5fa;cursor:pointer;}
|
| 94 |
</style>
|
| 95 |
""", unsafe_allow_html=True)
|
| 96 |
|
|
@@ -107,14 +58,13 @@ with st.sidebar:
|
|
| 107 |
)
|
| 108 |
st.markdown("---")
|
| 109 |
|
| 110 |
-
# π Registry display
|
| 111 |
if "registry" in st.session_state:
|
| 112 |
registry = st.session_state["registry"]
|
| 113 |
-
|
| 114 |
-
if
|
| 115 |
with st.expander("π Registered Documents", expanded=False):
|
| 116 |
-
for i,
|
| 117 |
-
st.markdown(f"**{i}. {
|
| 118 |
else:
|
| 119 |
st.caption("π No documents registered yet.")
|
| 120 |
else:
|
|
@@ -123,7 +73,6 @@ with st.sidebar:
|
|
| 123 |
st.markdown("---")
|
| 124 |
show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
|
| 125 |
if show_dev:
|
| 126 |
-
st.markdown("### ⚙️ Developer Options")
|
| 127 |
chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
|
| 128 |
overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
|
| 129 |
top_k = st.slider("Top K Results", 1, 10, 5)
|
|
@@ -132,32 +81,12 @@ with st.sidebar:
|
|
| 132 |
st.markdown("---")
|
| 133 |
st.caption("✨ Built by Shubham Sharma")
|
| 134 |
|
| 135 |
-
# ==========================================================
|
| 136 |
-
# 🧠 SESSION STATE
|
| 137 |
-
# ==========================================================
|
| 138 |
-
for key, val in {
|
| 139 |
-
"user_query_input": "",
|
| 140 |
-
"show_more": False,
|
| 141 |
-
"selected_suggestion": None,
|
| 142 |
-
"query_suggestions_fixed": None,
|
| 143 |
-
"last_doc": None,
|
| 144 |
-
"doc_lang": "en",
|
| 145 |
-
}.items():
|
| 146 |
-
if key not in st.session_state:
|
| 147 |
-
st.session_state[key] = val
|
| 148 |
-
|
| 149 |
-
def set_user_query(q, idx):
|
| 150 |
-
st.session_state["user_query_input"] = q
|
| 151 |
-
st.session_state["selected_suggestion"] = idx
|
| 152 |
-
st.experimental_rerun()
|
| 153 |
-
|
| 154 |
# ==========================================================
|
| 155 |
# π MAIN SECTION
|
| 156 |
# ==========================================================
|
| 157 |
st.title("π Enterprise Knowledge Assistant")
|
| 158 |
st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
|
| 159 |
|
| 160 |
-
# ✅ FIXED: must be defined before document-handling logic
|
| 161 |
doc_choice = st.radio(
|
| 162 |
"Select a document:",
|
| 163 |
["-- Select --", "Sample PDF", "Upload Custom PDF"],
|
|
@@ -165,7 +94,7 @@ doc_choice = st.radio(
|
|
| 165 |
)
|
| 166 |
|
| 167 |
# ==========================================================
|
| 168 |
-
# π DOCUMENT HANDLING
|
| 169 |
# ==========================================================
|
| 170 |
def _hash_content(file_path):
|
| 171 |
h = hashlib.sha256()
|
|
@@ -174,11 +103,6 @@ def _hash_content(file_path):
|
|
| 174 |
h.update(chunk)
|
| 175 |
return h.hexdigest()[:12]
|
| 176 |
|
| 177 |
-
def refresh_suggestions(doc_name, toc, chunks):
|
| 178 |
-
st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
|
| 179 |
-
toc, chunks, doc_name
|
| 180 |
-
)
|
| 181 |
-
st.session_state.update({"user_query_input": "", "selected_suggestion": None, "show_more": False})
|
| 182 |
|
| 183 |
if doc_choice == "-- Select --":
|
| 184 |
st.info("⬅️ Select or upload a document to begin.")
|
|
@@ -198,96 +122,84 @@ else:
|
|
| 198 |
|
| 199 |
if temp_path:
|
| 200 |
doc_name = os.path.basename(temp_path)
|
| 201 |
-
|
| 202 |
-
doc_identifier = f"{doc_name}_{
|
| 203 |
|
| 204 |
-
#
|
| 205 |
if "registry" not in st.session_state:
|
| 206 |
st.session_state["registry"] = DocumentRegistry()
|
| 207 |
registry = st.session_state["registry"]
|
| 208 |
|
| 209 |
-
|
| 210 |
-
if
|
| 211 |
-
|
|
|
|
| 212 |
st.session_state.update({
|
| 213 |
"text": doc_data.get("text", ""),
|
| 214 |
"toc": doc_data.get("toc", []),
|
| 215 |
"chunks": doc_data.get("chunks", []),
|
| 216 |
"embeddings": doc_data.get("embeddings"),
|
| 217 |
"index": doc_data.get("index"),
|
| 218 |
-
"doc_ready": True
|
| 219 |
-
"active_doc": existing["name"],
|
| 220 |
-
"status_text": f"β
{doc_name} already processed β loaded from registry."
|
| 221 |
})
|
| 222 |
-
|
| 223 |
-
st.
|
| 224 |
-
|
| 225 |
-
status = st.empty()
|
| 226 |
-
status.info("📤 Upload complete — reading document...")
|
| 227 |
-
|
| 228 |
-
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 229 |
-
status.info("π Parsing and chunking document...")
|
| 230 |
-
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
refresh_suggestions(doc_name, toc, chunks)
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
st.experimental_rerun()
|
| 252 |
|
| 253 |
-
|
| 254 |
-
st.
|
| 255 |
-
|
|
|
|
| 256 |
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
visible = suggs if st.session_state["show_more"] else suggs[:3]
|
| 260 |
cols = st.columns(min(3, len(visible)))
|
| 261 |
for i, q in enumerate(visible):
|
| 262 |
if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
|
| 263 |
-
|
|
|
|
| 264 |
|
| 265 |
toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
|
| 266 |
if st.button(toggle_text):
|
| 267 |
st.session_state["show_more"] = not st.session_state["show_more"]
|
| 268 |
st.experimental_rerun()
|
| 269 |
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
# π¨ Sidebar scroll style
|
| 288 |
-
# ==========================================================
|
| 289 |
-
st.markdown("""
|
| 290 |
-
<style>
|
| 291 |
-
section[data-testid="stSidebar"] div.stExpander {max-height:480px;overflow-y:auto;}
|
| 292 |
-
</style>
|
| 293 |
-
""", unsafe_allow_html=True)
|
|
|
|
| 1 |
# ==========================================================
|
| 2 |
+
# streamlit_app.py — Commit 2 (Stable)
|
| 3 |
# ==========================================================
|
| 4 |
import os
|
| 5 |
import re
|
|
|
|
| 29 |
"HF_MODULES_CACHE": CACHE_DIR,
|
| 30 |
})
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
# ==========================================================
|
| 33 |
# 🎨 STYLING
|
| 34 |
# ==========================================================
|
|
|
|
| 36 |
<style>
|
| 37 |
div.block-container {padding-top:1.2rem;max-width:1080px;}
|
| 38 |
h1,h2,h3{color:#f3f4f6;font-weight:600;}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
.answer-box{background:linear-gradient(180deg,#0b1220,#071027);
|
| 40 |
border-left:4px solid #3b82f6;border-radius:8px;padding:16px 18px;color:#e6eef8;
|
| 41 |
margin-top:12px;box-shadow:0 4px 14px rgba(0,0,0,0.35);}
|
| 42 |
.stTextInput>div>div>input{background-color:#0f172a!important;color:#f1f5f9!important;
|
| 43 |
border-radius:6px!important;border:1px solid #334155!important;padding:8px 10px!important;
|
| 44 |
font-size:15px!important;}
|
|
|
|
| 45 |
</style>
|
| 46 |
""", unsafe_allow_html=True)
|
| 47 |
|
|
|
|
| 58 |
)
|
| 59 |
st.markdown("---")
|
| 60 |
|
|
|
|
| 61 |
if "registry" in st.session_state:
|
| 62 |
registry = st.session_state["registry"]
|
| 63 |
+
registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
|
| 64 |
+
if registered_docs:
|
| 65 |
with st.expander("π Registered Documents", expanded=False):
|
| 66 |
+
for i, doc in enumerate(registered_docs, start=1):
|
| 67 |
+
st.markdown(f"**{i}. {doc.get('name','?')}** — {doc.get('num_chunks','?')} chunks *(TOC: {doc.get('toc_source','—')})*")
|
| 68 |
else:
|
| 69 |
st.caption("π No documents registered yet.")
|
| 70 |
else:
|
|
|
|
| 73 |
st.markdown("---")
|
| 74 |
show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
|
| 75 |
if show_dev:
|
|
|
|
| 76 |
chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
|
| 77 |
overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
|
| 78 |
top_k = st.slider("Top K Results", 1, 10, 5)
|
|
|
|
| 81 |
st.markdown("---")
|
| 82 |
st.caption("✨ Built by Shubham Sharma")
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
# ==========================================================
|
| 85 |
# π MAIN SECTION
|
| 86 |
# ==========================================================
|
| 87 |
st.title("π Enterprise Knowledge Assistant")
|
| 88 |
st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
|
| 89 |
|
|
|
|
| 90 |
doc_choice = st.radio(
|
| 91 |
"Select a document:",
|
| 92 |
["-- Select --", "Sample PDF", "Upload Custom PDF"],
|
|
|
|
| 94 |
)
|
| 95 |
|
| 96 |
# ==========================================================
|
| 97 |
+
# π DOCUMENT HANDLING (Commit 2)
|
| 98 |
# ==========================================================
|
| 99 |
def _hash_content(file_path):
|
| 100 |
h = hashlib.sha256()
|
|
|
|
| 103 |
h.update(chunk)
|
| 104 |
return h.hexdigest()[:12]
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
if doc_choice == "-- Select --":
|
| 108 |
st.info("⬅️ Select or upload a document to begin.")
|
|
|
|
| 122 |
|
| 123 |
if temp_path:
|
| 124 |
doc_name = os.path.basename(temp_path)
|
| 125 |
+
file_hash = _hash_content(temp_path)
|
| 126 |
+
doc_identifier = f"{doc_name}_{file_hash}"
|
| 127 |
|
| 128 |
+
# ✅ Registry initialization
|
| 129 |
if "registry" not in st.session_state:
|
| 130 |
st.session_state["registry"] = DocumentRegistry()
|
| 131 |
registry = st.session_state["registry"]
|
| 132 |
|
| 133 |
+
# ✅ Reuse if already processed
|
| 134 |
+
if doc_name in [d["name"] for d in registry.list_docs()]:
|
| 135 |
+
st.session_state["status_text"] = f"✅ {doc_name} already processed — loaded from registry."
|
| 136 |
+
doc_data = registry.get_doc(doc_name)
|
| 137 |
st.session_state.update({
|
| 138 |
"text": doc_data.get("text", ""),
|
| 139 |
"toc": doc_data.get("toc", []),
|
| 140 |
"chunks": doc_data.get("chunks", []),
|
| 141 |
"embeddings": doc_data.get("embeddings"),
|
| 142 |
"index": doc_data.get("index"),
|
| 143 |
+
"doc_ready": True
|
|
|
|
|
|
|
| 144 |
})
|
| 145 |
+
else:
|
| 146 |
+
status = st.empty()
|
| 147 |
+
status.info("📤 Upload complete — reading document...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
+
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 150 |
+
status.info("π Parsing and chunking document...")
|
| 151 |
+
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 152 |
+
status.info("🧠 Building embeddings and search index...")
|
| 153 |
+
embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
|
| 154 |
+
index = build_faiss_index(embeddings)
|
| 155 |
|
| 156 |
+
doc_id = registry.register(temp_path, chunks, embeddings, index)
|
| 157 |
+
st.session_state["active_doc"] = doc_id
|
| 158 |
+
status.success("✅ Document processed successfully — ready to query!")
|
|
|
|
| 159 |
|
| 160 |
+
st.session_state.update({
|
| 161 |
+
"text": text,
|
| 162 |
+
"toc": toc,
|
| 163 |
+
"chunks": chunks,
|
| 164 |
+
"embeddings": embeddings,
|
| 165 |
+
"index": index,
|
| 166 |
+
"doc_ready": True,
|
| 167 |
+
"last_doc": doc_identifier,
|
| 168 |
+
"status_text": "✅ Document processed successfully — ready to query!"
|
| 169 |
+
})
|
|
|
|
| 170 |
|
| 171 |
+
# --- Ask section ---
|
| 172 |
+
if st.session_state.get("doc_ready"):
|
| 173 |
+
st.info(st.session_state.get("status_text", "π Ready for queries."))
|
| 174 |
+
st.markdown("### 💬 Ask the Assistant")
|
| 175 |
|
| 176 |
+
query_suggestions = ["How do I start using this guide?", "What are the prerequisites?", "What is covered in this document?"]
|
| 177 |
+
visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
|
|
|
|
| 178 |
cols = st.columns(min(3, len(visible)))
|
| 179 |
for i, q in enumerate(visible):
|
| 180 |
if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
|
| 181 |
+
st.session_state["user_query_input"] = q
|
| 182 |
+
st.experimental_rerun()
|
| 183 |
|
| 184 |
toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
|
| 185 |
if st.button(toggle_text):
|
| 186 |
st.session_state["show_more"] = not st.session_state["show_more"]
|
| 187 |
st.experimental_rerun()
|
| 188 |
|
| 189 |
+
user_query = st.text_input("Type your question or click one above:", key="user_query_input")
|
| 190 |
+
|
| 191 |
+
if user_query.strip():
|
| 192 |
+
reasoning_mode = mode == "Extended (Document + General)"
|
| 193 |
+
with st.spinner("π Generating your answer..."):
|
| 194 |
+
retrieved = retrieve_chunks(
|
| 195 |
+
user_query,
|
| 196 |
+
st.session_state["index"],
|
| 197 |
+
st.session_state["chunks"],
|
| 198 |
+
top_k=top_k,
|
| 199 |
+
embeddings=st.session_state["embeddings"]
|
| 200 |
+
)
|
| 201 |
+
answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
|
| 202 |
+
st.session_state["retrieved"] = retrieved
|
| 203 |
+
|
| 204 |
+
st.markdown("### 🤖 Assistant’s Answer")
|
| 205 |
+
st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|