Update src/streamlit_app.py
Browse files- src/streamlit_app.py +61 -98
src/streamlit_app.py
CHANGED
|
@@ -24,17 +24,15 @@ st.set_page_config(
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ==========================================================
|
| 27 |
-
# π§Ή Cache Management
|
| 28 |
# ==========================================================
|
| 29 |
def clean_cache(max_size_gb: float = 2.0):
|
| 30 |
-
"""Cleans large cache folders (> max_size_gb)."""
|
| 31 |
folders = [
|
| 32 |
"/root/.cache/huggingface",
|
| 33 |
"/root/.cache/transformers",
|
| 34 |
"/root/.cache/torch",
|
| 35 |
]
|
| 36 |
total_deleted = 0.0
|
| 37 |
-
|
| 38 |
for folder in folders:
|
| 39 |
if os.path.exists(folder):
|
| 40 |
size_gb = sum(
|
|
@@ -45,13 +43,10 @@ def clean_cache(max_size_gb: float = 2.0):
|
|
| 45 |
if size_gb > max_size_gb or "torch" in folder:
|
| 46 |
shutil.rmtree(folder, ignore_errors=True)
|
| 47 |
total_deleted += size_gb
|
| 48 |
-
print(f"ποΈ Deleted {folder} ({size_gb:.2f} GB)")
|
| 49 |
os.makedirs("/tmp/hf_cache", exist_ok=True)
|
| 50 |
print(f"π§Ή Cache cleanup done. ~{total_deleted:.2f} GB removed.")
|
| 51 |
|
| 52 |
-
|
| 53 |
def check_disk_usage():
|
| 54 |
-
"""Display disk usage info in sidebar."""
|
| 55 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 56 |
try:
|
| 57 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
@@ -59,13 +54,11 @@ def check_disk_usage():
|
|
| 59 |
except Exception as e:
|
| 60 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 61 |
|
| 62 |
-
|
| 63 |
-
# Run cache cleanup once at startup
|
| 64 |
clean_cache()
|
| 65 |
check_disk_usage()
|
| 66 |
|
| 67 |
# ==========================================================
|
| 68 |
-
# βοΈ
|
| 69 |
# ==========================================================
|
| 70 |
CACHE_DIR = "/tmp/hf_cache"
|
| 71 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
@@ -81,13 +74,12 @@ os.environ.update({
|
|
| 81 |
# ==========================================================
|
| 82 |
from ingestion import extract_text_from_pdf, chunk_text
|
| 83 |
from vectorstore import build_faiss_index
|
| 84 |
-
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
|
| 85 |
|
| 86 |
# ==========================================================
|
| 87 |
-
# π§ TOC
|
| 88 |
# ==========================================================
|
| 89 |
def clean_toc_titles(toc):
|
| 90 |
-
"""Removes section numbers and keeps only meaningful text."""
|
| 91 |
clean_titles = []
|
| 92 |
for _, title in toc:
|
| 93 |
title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
|
|
@@ -98,14 +90,12 @@ def clean_toc_titles(toc):
|
|
| 98 |
|
| 99 |
|
| 100 |
def generate_query_suggestions(toc_titles):
|
| 101 |
-
"""Converts section titles into conversational question suggestions."""
|
| 102 |
suggestions = []
|
| 103 |
for t in toc_titles:
|
| 104 |
lower = t.lower()
|
| 105 |
-
|
| 106 |
if "prerequisite" in lower:
|
| 107 |
suggestions.append("What are the prerequisites for setting this up?")
|
| 108 |
-
elif "restriction" in lower
|
| 109 |
suggestions.append("What are the key restrictions or limitations?")
|
| 110 |
elif "configuration" in lower or "setup" in lower:
|
| 111 |
suggestions.append(f"How do I {t.lower()}?")
|
|
@@ -117,12 +107,8 @@ def generate_query_suggestions(toc_titles):
|
|
| 117 |
suggestions.append("Can you show an example from this document?")
|
| 118 |
elif "process" in lower:
|
| 119 |
suggestions.append(f"Can you explain the {t.lower()} process?")
|
| 120 |
-
elif "use" in lower:
|
| 121 |
-
suggestions.append(f"How do I {t.lower()}?")
|
| 122 |
else:
|
| 123 |
suggestions.append(f"Explain the section about {t.lower()}.")
|
| 124 |
-
|
| 125 |
-
# Deduplicate & limit
|
| 126 |
seen, final = set(), []
|
| 127 |
for s in suggestions:
|
| 128 |
if s not in seen:
|
|
@@ -131,38 +117,48 @@ def generate_query_suggestions(toc_titles):
|
|
| 131 |
return final[:6]
|
| 132 |
|
| 133 |
|
| 134 |
-
def
|
| 135 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 136 |
if not chunks:
|
| 137 |
return []
|
| 138 |
|
| 139 |
-
# Take
|
| 140 |
-
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
if "setup" in head_text or "configuration" in head_text:
|
| 146 |
-
suggestions.append("How do I configure or set this up?")
|
| 147 |
-
if "prerequisite" in head_text:
|
| 148 |
-
suggestions.append("What are the prerequisites before using this process?")
|
| 149 |
-
if "troubleshoot" in head_text or "error" in head_text:
|
| 150 |
-
suggestions.append("How do I troubleshoot common errors?")
|
| 151 |
-
if "step" in head_text or "procedure" in head_text:
|
| 152 |
-
suggestions.append("Can you list the steps involved in this process?")
|
| 153 |
-
if "benefit" in head_text or "objective" in head_text:
|
| 154 |
-
suggestions.append("What is the objective or benefit of this guide?")
|
| 155 |
-
|
| 156 |
-
# Fallback generic questions if no keywords found
|
| 157 |
-
if not suggestions:
|
| 158 |
-
suggestions = [
|
| 159 |
-
"Can you summarize the main topic of this document?",
|
| 160 |
-
"What process does this guide explain?",
|
| 161 |
-
"How can I get started with the described setup?",
|
| 162 |
-
"What are the important details to remember?",
|
| 163 |
-
]
|
| 164 |
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
|
| 168 |
# ==========================================================
|
|
@@ -191,21 +187,17 @@ with st.sidebar:
|
|
| 191 |
st.session_state.reasoning_mode = st.toggle(
|
| 192 |
"π§ Enable Reasoning Mode",
|
| 193 |
value=st.session_state.reasoning_mode,
|
| 194 |
-
help="When ON: GPT-4o uses reasoning +
|
| 195 |
)
|
| 196 |
|
| 197 |
st.markdown("---")
|
| 198 |
st.header("π Document Library")
|
| 199 |
-
doc_choice = st.radio(
|
| 200 |
-
"Choose a document:",
|
| 201 |
-
["-- Select --", "Sample PDF", "Upload Custom PDF"],
|
| 202 |
-
index=0
|
| 203 |
-
)
|
| 204 |
|
| 205 |
st.markdown("---")
|
| 206 |
st.header("βοΈ Settings")
|
| 207 |
-
chunk_size = st.slider("Chunk Size
|
| 208 |
-
overlap = st.slider("Chunk Overlap
|
| 209 |
top_k = st.slider("Top K Results", 1, 10, 5)
|
| 210 |
st.markdown("---")
|
| 211 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
|
@@ -218,49 +210,21 @@ text, chunks, index, embeddings, toc = None, None, None, None, None
|
|
| 218 |
if doc_choice == "-- Select --":
|
| 219 |
st.info("β¬
οΈ Please choose a document from the sidebar.")
|
| 220 |
|
| 221 |
-
elif doc_choice
|
| 222 |
-
temp_path = SAMPLE_PATH
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
st.text_area("TOC Preview", toc_text, height=200)
|
| 234 |
-
|
| 235 |
-
clean_titles = clean_toc_titles(toc)
|
| 236 |
-
query_suggestions = generate_query_suggestions(clean_titles)
|
| 237 |
-
else:
|
| 238 |
-
st.warning("β οΈ No TOC detected β generating smart suggestions using content...")
|
| 239 |
-
query_suggestions = generate_ai_fallback_suggestions(chunks)
|
| 240 |
-
|
| 241 |
-
if query_suggestions:
|
| 242 |
-
st.markdown("#### π‘ Suggested Questions")
|
| 243 |
-
cols = st.columns(2)
|
| 244 |
-
for i, q in enumerate(query_suggestions):
|
| 245 |
-
if cols[i % 2].button(f"π {q}"):
|
| 246 |
-
st.session_state["user_query"] = q
|
| 247 |
-
|
| 248 |
-
with st.spinner("βοΈ Loading cached embeddings or generating new ones..."):
|
| 249 |
-
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 250 |
-
index = build_faiss_index(embeddings)
|
| 251 |
-
|
| 252 |
-
elif doc_choice == "Upload Custom PDF":
|
| 253 |
-
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
| 254 |
-
if uploaded_file:
|
| 255 |
-
temp_path = os.path.join("/tmp", uploaded_file.name)
|
| 256 |
-
with open(temp_path, "wb") as f:
|
| 257 |
-
f.write(uploaded_file.getbuffer())
|
| 258 |
-
st.success(f"β
File '{uploaded_file.name}' uploaded successfully")
|
| 259 |
-
|
| 260 |
-
with st.spinner("βοΈ Extracting and processing your document..."):
|
| 261 |
text, toc = extract_text_from_pdf(temp_path)
|
| 262 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 263 |
-
st.write(f"
|
| 264 |
|
| 265 |
if toc:
|
| 266 |
st.markdown("### π§ Detected Table of Contents")
|
|
@@ -270,8 +234,8 @@ elif doc_choice == "Upload Custom PDF":
|
|
| 270 |
clean_titles = clean_toc_titles(toc)
|
| 271 |
query_suggestions = generate_query_suggestions(clean_titles)
|
| 272 |
else:
|
| 273 |
-
st.warning("β οΈ No TOC detected β generating
|
| 274 |
-
query_suggestions =
|
| 275 |
|
| 276 |
if query_suggestions:
|
| 277 |
st.markdown("#### π‘ Suggested Questions")
|
|
@@ -325,6 +289,5 @@ if index and chunks:
|
|
| 325 |
""",
|
| 326 |
unsafe_allow_html=True,
|
| 327 |
)
|
| 328 |
-
|
| 329 |
else:
|
| 330 |
st.info("π₯ Upload or select a document to start exploring.")
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ==========================================================
|
| 27 |
+
# π§Ή Cache Management
|
| 28 |
# ==========================================================
|
| 29 |
def clean_cache(max_size_gb: float = 2.0):
|
|
|
|
| 30 |
folders = [
|
| 31 |
"/root/.cache/huggingface",
|
| 32 |
"/root/.cache/transformers",
|
| 33 |
"/root/.cache/torch",
|
| 34 |
]
|
| 35 |
total_deleted = 0.0
|
|
|
|
| 36 |
for folder in folders:
|
| 37 |
if os.path.exists(folder):
|
| 38 |
size_gb = sum(
|
|
|
|
| 43 |
if size_gb > max_size_gb or "torch" in folder:
|
| 44 |
shutil.rmtree(folder, ignore_errors=True)
|
| 45 |
total_deleted += size_gb
|
|
|
|
| 46 |
os.makedirs("/tmp/hf_cache", exist_ok=True)
|
| 47 |
print(f"π§Ή Cache cleanup done. ~{total_deleted:.2f} GB removed.")
|
| 48 |
|
|
|
|
| 49 |
def check_disk_usage():
|
|
|
|
| 50 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 51 |
try:
|
| 52 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
|
|
| 54 |
except Exception as e:
|
| 55 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 56 |
|
|
|
|
|
|
|
| 57 |
clean_cache()
|
| 58 |
check_disk_usage()
|
| 59 |
|
| 60 |
# ==========================================================
|
| 61 |
+
# βοΈ HF Cache Configuration
|
| 62 |
# ==========================================================
|
| 63 |
CACHE_DIR = "/tmp/hf_cache"
|
| 64 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
|
|
| 74 |
# ==========================================================
|
| 75 |
from ingestion import extract_text_from_pdf, chunk_text
|
| 76 |
from vectorstore import build_faiss_index
|
| 77 |
+
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate # add genai_generate!
|
| 78 |
|
| 79 |
# ==========================================================
|
| 80 |
+
# π§ TOC & Dynamic AI Suggestion System
|
| 81 |
# ==========================================================
|
| 82 |
def clean_toc_titles(toc):
|
|
|
|
| 83 |
clean_titles = []
|
| 84 |
for _, title in toc:
|
| 85 |
title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
|
|
|
|
| 90 |
|
| 91 |
|
| 92 |
def generate_query_suggestions(toc_titles):
|
|
|
|
| 93 |
suggestions = []
|
| 94 |
for t in toc_titles:
|
| 95 |
lower = t.lower()
|
|
|
|
| 96 |
if "prerequisite" in lower:
|
| 97 |
suggestions.append("What are the prerequisites for setting this up?")
|
| 98 |
+
elif "restriction" in lower:
|
| 99 |
suggestions.append("What are the key restrictions or limitations?")
|
| 100 |
elif "configuration" in lower or "setup" in lower:
|
| 101 |
suggestions.append(f"How do I {t.lower()}?")
|
|
|
|
| 107 |
suggestions.append("Can you show an example from this document?")
|
| 108 |
elif "process" in lower:
|
| 109 |
suggestions.append(f"Can you explain the {t.lower()} process?")
|
|
|
|
|
|
|
| 110 |
else:
|
| 111 |
suggestions.append(f"Explain the section about {t.lower()}.")
|
|
|
|
|
|
|
| 112 |
seen, final = set(), []
|
| 113 |
for s in suggestions:
|
| 114 |
if s not in seen:
|
|
|
|
| 117 |
return final[:6]
|
| 118 |
|
| 119 |
|
| 120 |
+
def generate_ai_dynamic_suggestions(chunks, doc_name="Document"):
    """
    🤖 Uses GPT-4o via SAP GenAI Hub to analyze first few chunks
    and generate dynamic, context-aware question suggestions.

    Args:
        chunks: List of text chunks extracted from the document.
        doc_name: Display name of the document, interpolated into the prompt.

    Returns:
        A list of up to 6 suggested question strings. Returns [] for empty
        input; falls back to generic questions when the model call fails
        or yields nothing usable.
    """
    if not chunks:
        return []

    # Take top 3 chunks as context, capped at 3000 chars to keep the prompt small
    sample_text = " ".join(chunks[:3])[:3000]
    prompt = f"""
    You are an intelligent assistant helping users explore enterprise documentation titled '{doc_name}'.

    Based on the content below, generate 5 short, interactive, human-like questions
    that a curious user might ask to understand this document better.
    Avoid section numbers, and sound conversational.

    ---
    Content Sample:
    {sample_text}
    ---
    Questions:
    """

    try:
        ai_response = genai_generate(prompt)  # Uses your existing GPT-4o connector
        clean_q = _parse_suggested_questions(ai_response)
        return clean_q[:6] if clean_q else [
            "What is this document about?",
            "How do I start using the process described here?",
            "What key setup steps are involved?",
            "What benefits or objectives are explained?",
        ]
    except Exception as e:
        # Best-effort: suggestions are a UX nicety — never crash the app here.
        print(f"⚠️ AI suggestion generation failed: {e}")
        return [
            "Can you summarize the document?",
            "What is the main idea here?",
            "How does this guide help me?",
        ]


def _parse_suggested_questions(ai_response):
    """Extract clean question strings from a raw model response.

    Strips bullet markers AND numbered-list prefixes ("1.", "2)") — the
    previous per-character strip("•- ") let model-emitted enumeration leak
    into the UI. Keeps only plausible-length lines ending with "?".
    """
    questions = []
    for line in ai_response.splitlines():
        # Drop leading bullets/enumeration (e.g. "- ", "• ", "* ", "3. ", "4) ").
        q = re.sub(r"^\s*(?:[-•*]+|\d+[.)])\s*", "", line).strip()
        if 8 < len(q) < 120 and q.endswith("?"):
            questions.append(q)
    return questions
|
| 162 |
|
| 163 |
|
| 164 |
# ==========================================================
|
|
|
|
| 187 |
st.session_state.reasoning_mode = st.toggle(
|
| 188 |
"π§ Enable Reasoning Mode",
|
| 189 |
value=st.session_state.reasoning_mode,
|
| 190 |
+
help="When ON: GPT-4o uses reasoning + synthesis.\nWhen OFF: strictly factual."
|
| 191 |
)
|
| 192 |
|
| 193 |
st.markdown("---")
|
| 194 |
st.header("π Document Library")
|
| 195 |
+
doc_choice = st.radio("Choose a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
st.markdown("---")
|
| 198 |
st.header("βοΈ Settings")
|
| 199 |
+
chunk_size = st.slider("Chunk Size", 200, 1500, 800, step=50)
|
| 200 |
+
overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
|
| 201 |
top_k = st.slider("Top K Results", 1, 10, 5)
|
| 202 |
st.markdown("---")
|
| 203 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
|
|
|
| 210 |
if doc_choice == "-- Select --":
|
| 211 |
st.info("β¬
οΈ Please choose a document from the sidebar.")
|
| 212 |
|
| 213 |
+
elif doc_choice in ["Sample PDF", "Upload Custom PDF"]:
|
| 214 |
+
temp_path = SAMPLE_PATH if doc_choice == "Sample PDF" else None
|
| 215 |
+
if doc_choice == "Upload Custom PDF":
|
| 216 |
+
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
| 217 |
+
if uploaded_file:
|
| 218 |
+
temp_path = os.path.join("/tmp", uploaded_file.name)
|
| 219 |
+
with open(temp_path, "wb") as f:
|
| 220 |
+
f.write(uploaded_file.getbuffer())
|
| 221 |
+
st.success(f"β
File '{uploaded_file.name}' uploaded successfully")
|
| 222 |
+
|
| 223 |
+
if temp_path:
|
| 224 |
+
with st.spinner("π Extracting and processing document..."):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
text, toc = extract_text_from_pdf(temp_path)
|
| 226 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 227 |
+
st.write(f"π Extracted {len(chunks)} chunks.")
|
| 228 |
|
| 229 |
if toc:
|
| 230 |
st.markdown("### π§ Detected Table of Contents")
|
|
|
|
| 234 |
clean_titles = clean_toc_titles(toc)
|
| 235 |
query_suggestions = generate_query_suggestions(clean_titles)
|
| 236 |
else:
|
| 237 |
+
st.warning("β οΈ No TOC detected β generating dynamic suggestions using AI...")
|
| 238 |
+
query_suggestions = generate_ai_dynamic_suggestions(chunks, doc_name=os.path.basename(temp_path))
|
| 239 |
|
| 240 |
if query_suggestions:
|
| 241 |
st.markdown("#### π‘ Suggested Questions")
|
|
|
|
| 289 |
""",
|
| 290 |
unsafe_allow_html=True,
|
| 291 |
)
|
|
|
|
| 292 |
else:
|
| 293 |
st.info("π₯ Upload or select a document to start exploring.")
|