Shubham170793 committed (verified)
Commit 71328cc · Parent: fb3091e

Update src/streamlit_app.py

Drop the startup cache-cleanup routine and GPU diagnostics, add a minimal dark CSS theme, switch the suggestion buttons to on_click callbacks, and move the table of contents and document preview into expanders.

Files changed (1): src/streamlit_app.py +124 -142
src/streamlit_app.py CHANGED
@@ -5,46 +5,13 @@ import streamlit as st
 import torch
 
 # ==========================================================
-# ✅ Environment Diagnostics
-# ==========================================================
-print("CUDA available:", torch.cuda.is_available())
-if torch.cuda.is_available():
-    print("GPU:", torch.cuda.get_device_name(0))
-else:
-    print("Running on CPU")
-
-# ==========================================================
-# ✅ Page Configuration
+# ✅ Environment Setup
 # ==========================================================
 st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
+print("CUDA available:", torch.cuda.is_available())
 
 # ==========================================================
-# 🧹 Cache Management
-# ==========================================================
-def clean_cache(max_size_gb: float = 2.0):
-    folders = [
-        "/root/.cache/huggingface",
-        "/root/.cache/transformers",
-        "/root/.cache/torch",
-    ]
-    total_deleted = 0.0
-    for folder in folders:
-        if os.path.exists(folder):
-            size_gb = sum(
-                os.path.getsize(os.path.join(dp, f))
-                for dp, _, files in os.walk(folder)
-                for f in files
-            ) / (1024**3)
-            if size_gb > max_size_gb or "torch" in folder:
-                shutil.rmtree(folder, ignore_errors=True)
-                total_deleted += size_gb
-    os.makedirs("/tmp/hf_cache", exist_ok=True)
-    print(f"🧹 Cache cleanup done. Removed ~{total_deleted:.2f} GB.")
-
-clean_cache()
-
-# ==========================================================
-# ⚙️ Hugging Face Cache Configuration
+# ⚙️ Hugging Face Cache Setup
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
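Note: the os.environ.update({ ... }) dict that consumes CACHE_DIR falls between this hunk and the next, so its exact keys are not part of the diff. As a rough sketch only, a typical redirect of the Hugging Face caches into a writable /tmp directory looks like this (the key names below are an assumption, not necessarily this app's):

    import os

    CACHE_DIR = "/tmp/hf_cache"
    os.makedirs(CACHE_DIR, exist_ok=True)
    os.environ.update({
        "HF_HOME": CACHE_DIR,             # umbrella cache root for huggingface_hub
        "TRANSFORMERS_CACHE": CACHE_DIR,  # where transformers stores model weights
        "HF_DATASETS_CACHE": CACHE_DIR,   # datasets cache
    })

These variables are read when the libraries load, which is presumably why the app keeps its heavyweight imports below this block (the old header even said "Imports AFTER Environment Setup").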
@@ -56,37 +23,40 @@ os.environ.update({
 })
 
 # ==========================================================
-# 📦 Imports AFTER Environment Setup
+# 📦 Imports
 # ==========================================================
 from ingestion import extract_text_from_pdf, chunk_text
 from vectorstore import build_faiss_index
 from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
 
 # ==========================================================
-# 🧠 Smart Suggestion Generator (Based on TOC + Content)
+# 🧠 Smart Suggestion Generator
 # ==========================================================
 def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
-    if not toc and not chunks:
+    """Generate clean, context-aware questions dynamically from TOC and text."""
+    if not toc or not chunks:
         return []
 
     titles = []
-    for sec, raw_title in toc or []:
+    for sec, raw_title in toc:
         title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
         title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
         if 4 < len(title) < 120:
            titles.append(title)
 
-    context_sample = " ".join(chunks[:3])[:3000]
+    context_sample = " ".join(chunks[:3])[:4000]
     prompt = f"""
-    You are helping a user explore a document titled "{doc_name}".
+    You are generating concise, context-aware questions based on the document "{doc_name}".
+    Use this Table of Contents and sample content for inspiration.
 
    TABLE OF CONTENTS:
-    {chr(10).join(['- ' + t for t in titles[:10]])}
+    {chr(10).join(['- ' + t for t in titles[:8]])}
 
-    CONTENT SAMPLE:
+    TEXT SAMPLE:
    {context_sample}
 
-    Generate 5–7 short, specific professional questions.
-    Each should be under 20 words, well-phrased, and relevant to the document.
+    Generate 5–7 questions that are short, relevant, and strictly document-based.
+    Each question should be under 18 words.
    """
 
    try:
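The two re.sub calls in the rewritten generator strip section numbering and dot leaders from TOC entries before they reach the prompt. A quick illustration with a made-up entry (hypothetical input, not taken from the sample PDF):

    import re

    raw_title = "2.1 Configuring the Connector ........ 14"
    title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)  # drops the leading "2.1"
    title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()      # drops the dot leader and page number
    print(title)  # Configuring the Connector

One caveat: the first character class also matches letters, so an unnumbered entry loses its first word ("Getting Started" becomes "Started"), and a one-word entry is emptied and then discarded by the 4 < len(title) < 120 filter.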
@@ -100,17 +70,66 @@ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
             final.append(q)
         return final[:7]
     except Exception:
-        return [
-            "What is this document about?",
-            "How do I configure this process?",
-            "What key steps are described?",
-        ]
-
-# ==========================================================
-# 🖥️ Header
-# ==========================================================
-st.title("📄 Enterprise Knowledge Assistant")
-st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
+        return ["What is this document about?", "How do I start using this process?"]
+
+# ==========================================================
+# 🎨 Styling — Customer-Ready Minimal Theme
+# ==========================================================
+st.markdown("""
+<style>
+div.block-container {
+    padding-top: 1.5rem;
+    max-width: 1050px;
+}
+h1, h2, h3, h4 {
+    font-weight: 600;
+    color: #f3f4f6;
+}
+hr {
+    border: none;
+    border-top: 1px solid #2c2c2c;
+    margin: 1rem 0;
+}
+.suggest-chip {
+    background-color: #1f2937;
+    border: 1px solid #374151;
+    border-radius: 16px;
+    color: #e5e7eb;
+    padding: 6px 12px;
+    cursor: pointer;
+    font-size: 13px;
+    transition: all 0.2s ease-in-out;
+}
+.suggest-chip:hover {
+    background-color: #2563eb;
+    border-color: #3b82f6;
+    color: white;
+    box-shadow: 0 0 8px rgba(59,130,246,0.4);
+}
+.answer-box {
+    background: linear-gradient(135deg, #0f172a, #1e293b);
+    border-left: 4px solid #3b82f6;
+    border-radius: 8px;
+    padding: 14px 16px;
+    color: #f1f5f9;
+    margin-top: 1rem;
+    box-shadow: 0 0 10px rgba(59,130,246,0.1);
+}
+.stTextInput > div > div > input {
+    background-color: #0f172a;
+    color: #f1f5f9;
+    border-radius: 6px;
+    border: 1px solid #334155;
+    padding: 6px 10px;
+}
+.stTextArea > div > div > textarea {
+    background-color: #0f172a;
+    color: #f1f5f9;
+    border-radius: 6px;
+    border: 1px solid #334155;
+}
+</style>
+""", unsafe_allow_html=True)
 
 # ==========================================================
 # 🧭 Sidebar
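Of the classes injected above, answer-box is the one actually consumed later in this commit (the assistant's answer is wrapped in <div class='answer-box'>). suggest-chip has no matching markup in the visible diff; the suggestion buttons below are plain st.button widgets, so they keep Streamlit's native styling unless other code attaches that class.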
@@ -118,49 +137,40 @@ st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
 with st.sidebar:
     if "reasoning_mode" not in st.session_state:
         st.session_state.reasoning_mode = False
-    st.session_state.reasoning_mode = st.toggle(
-        "🧠 Enable Reasoning Mode",
-        value=st.session_state.reasoning_mode,
-        help="ON = detailed reasoning · OFF = concise factual answers"
-    )
 
-    st.markdown("---")
-    st.header("📚 Document Library")
-    doc_choice = st.radio("Choose a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=1)
-
-    st.markdown("---")
-    st.header("⚙️ Settings")
-    chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 1000, step=50)
-    overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
+    st.markdown("### ⚙️ Settings")
+    reasoning_mode = st.toggle("🧠 Enable Reasoning Mode", st.session_state.reasoning_mode)
+    st.session_state.reasoning_mode = reasoning_mode
+
+    chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
+    overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
     top_k = st.slider("Top K Results", 1, 10, 5)
     st.markdown("---")
     st.caption("✨ Built by Shubham Sharma")
 
 # ==========================================================
-# 🧾 Document Handling
+# 📄 Main Flow
 # ==========================================================
+st.title("Enterprise Knowledge Assistant")
+st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
+
 text, chunks, index, embeddings, toc = None, None, None, None, None
 
-# --- Initialize Session State ---
-for key, default in {
-    "show_more": False,
-    "user_query_input": "",
-    "selected_suggestion": None,
-}.items():
-    if key not in st.session_state:
-        st.session_state[key] = default
-
-# ==========================================================
-# 📄 Main Flow
-# ==========================================================
-BASE_DIR = os.path.dirname(__file__)
-SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
+if "user_query_input" not in st.session_state:
+    st.session_state["user_query_input"] = ""
+if "show_more" not in st.session_state:
+    st.session_state["show_more"] = False
+
+def set_user_query(q):
+    st.session_state["user_query_input"] = q
+
+doc_choice = st.radio("", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=1)
 
 if doc_choice == "-- Select --":
-    st.info("⬅️ Please choose a document to begin.")
+    st.info("⬅️ Select a document to begin.")
 else:
     if doc_choice == "Sample PDF":
-        temp_path = SAMPLE_PATH
+        temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
         st.success("📘 Using built-in Sample PDF.")
     else:
         uploaded_file = st.file_uploader("📂 Upload your PDF", type="pdf")
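The set_user_query helper defined in this hunk pairs with the on_click buttons in the next one, and the switch matters in Streamlit: the old code passed both value=st.session_state.user_query_input and key="user_query_input" to the same text_input while also writing that key from a button branch, a combination that tends to trigger Streamlit's "created with a default value but also had its value set via the Session State API" warning. An on_click callback instead runs before the script reruns, when mutating the key is still allowed. A minimal self-contained sketch of the pattern (example strings are hypothetical):

    import streamlit as st

    if "user_query_input" not in st.session_state:
        st.session_state["user_query_input"] = ""

    def set_user_query(q):
        # Callbacks fire before the rerun, so the widget keyed
        # "user_query_input" has not been rendered yet when this executes.
        st.session_state["user_query_input"] = q

    st.button("🔍 Example question", on_click=set_user_query, args=("Example question",))
    st.text_input("Type your question or pick one above:", key="user_query_input")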
@@ -172,88 +182,60 @@ else:
         else:
             temp_path = None
 
-    # ------------------------------
-    # Document Processing
-    # ------------------------------
     if temp_path:
         with st.spinner("🔍 Processing your document..."):
             text, toc = extract_text_from_pdf(temp_path)
             chunks = chunk_text(text, chunk_size=chunk_size)
-        st.markdown("✅ Document loaded successfully.")
-
-        query_suggestions = generate_dynamic_suggestions_from_toc(
-            toc, chunks, os.path.basename(temp_path)
-        )
+            query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, os.path.basename(temp_path))
 
-        with st.spinner("⚙️ Preparing embeddings..."):
+        with st.spinner("⚙️ Preparing search index..."):
             embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
             index = build_faiss_index(embeddings)
-        st.markdown("🚀 Document ready — you can now ask questions below.")
+        st.success("🚀 Document ready.")
 
-        # ==========================================================
+        # ----------------------------------------------------------
         # 💬 Ask a Question
-        # ==========================================================
-        st.markdown("## 🤖 Ask a Question")
-        if query_suggestions:
-            st.markdown("#### 💡 Suggested Questions")
+        # ----------------------------------------------------------
+        st.markdown("### Ask a Question")
 
+        if query_suggestions:
             visible = query_suggestions if st.session_state.show_more else query_suggestions[:3]
             cols = st.columns(min(3, len(visible)))
+
             for i, q in enumerate(visible):
-                col = cols[i % 3]
-                if col.button(f"🔍 {q}", key=f"q_{i}"):
-                    st.session_state.selected_suggestion = i
-                    st.session_state.user_query_input = q
+                cols[i % 3].button(f"🔍 {q}", key=f"suggest_{i}", on_click=set_user_query, args=(q,))
 
             toggle_text = "Show less ▲" if st.session_state.show_more else "Show more ▼"
             if st.button(toggle_text):
                 st.session_state.show_more = not st.session_state.show_more
                 st.experimental_rerun()
 
-        # ----------------------------------------------------------
-        # 🧠 Input Field
-        # ----------------------------------------------------------
-        user_query = st.text_input(
-            "Type your question or pick one above:",
-            value=st.session_state.user_query_input,
-            key="user_query_input"
-        )
+        user_query = st.text_input("Type your question or pick one above:", key="user_query_input")
 
-        # ----------------------------------------------------------
-        # 🧩 Generate Answer
-        # ----------------------------------------------------------
         if user_query.strip():
-            st.caption(
-                "Mode: 🧠 Reasoning"
-                if st.session_state.reasoning_mode
-                else "Mode: 📄 Strict Document"
-            )
-            with st.spinner("💭 Analyzing your document..."):
+            with st.spinner("💭 Analyzing document..."):
                 retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
-                answer = generate_answer(
-                    user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode
-                )
-
-            # Assistant Answer
-            st.markdown("### ✅ Assistant’s Answer")
-            st.markdown(
-                f"<div style='background-color:#111827;padding:14px;border-radius:8px;color:#f1f5f9;'>"
-                f"💡 {answer}</div>",
-                unsafe_allow_html=True,
-            )
+                answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
+
+            st.markdown("### Assistant’s Answer")
+            st.markdown(f"<div class='answer-box'>💡 {answer}</div>", unsafe_allow_html=True)
 
             with st.expander("📄 Supporting Context"):
                 for i, r in enumerate(retrieved, start=1):
-                    st.markdown(
-                        f"<div style='background-color:#1f2937;padding:10px;border-radius:6px;color:#e5e7eb;margin-bottom:8px;'>"
-                        f"<b>Chunk {i}:</b> {r}</div>",
-                        unsafe_allow_html=True,
-                    )
+                    st.markdown(f"**Chunk {i}:** {r}")
+
+        # ----------------------------------------------------------
+        # 📚 Table of Contents
+        # ----------------------------------------------------------
+        if toc:
+            with st.expander("📚 Table of Contents"):
+                toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
+                st.text_area("", toc_text, height=150)
 
         # ----------------------------------------------------------
-        # 📑 Document Preview
+        # 📄 Document Preview
         # ----------------------------------------------------------
-        st.markdown("---")
-        st.subheader("📑 Document Preview")
-        st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
-        st.caption(f"📦 {len(chunks)} chunks processed.")
+        if chunks:
+            with st.expander("📄 Document Preview"):
+                st.text_area("", text[:1000], height=150)
+                st.caption(f"{len(chunks)} chunks processed.")
 
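build_faiss_index and cache_embeddings live in vectorstore.py and qa.py, which this commit does not touch. For orientation only, a flat FAISS index over the chunk embeddings could look like the following sketch (an assumption about the helper, not the repo's actual code):

    import faiss
    import numpy as np

    def build_faiss_index(embeddings):
        # Hypothetical sketch: exact (brute-force) L2 search over all chunk vectors.
        vectors = np.asarray(embeddings, dtype="float32")
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        return index

retrieve_chunks would then call index.search(query, top_k) with a (1, d) float32 query vector and map the returned ids back to chunk strings. Separately, the show-more toggle kept by this commit still calls st.experimental_rerun(), which newer Streamlit releases deprecate in favor of st.rerun().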