Starberry15 committed on
Commit
f521fb7
Β·
verified Β·
1 Parent(s): 95d0828

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +107 -121
src/streamlit_app.py CHANGED
@@ -1,9 +1,13 @@
 
 
 
 
 
1
  import os
2
- import time
3
  import glob
4
  import json
 
5
  from typing import List, Dict, Any
6
-
7
  import numpy as np
8
  import streamlit as st
9
  import PyPDF2
@@ -12,24 +16,30 @@ from dotenv import load_dotenv
12
  from huggingface_hub import InferenceClient, login
13
  from streamlit_chat import message as st_message
14
 
15
- # Try importing FAISS
16
  try:
17
  import faiss
18
  except ImportError:
19
  faiss = None
20
 
21
  # =============================================================
22
- # 🌐 Environment & Page Setup
 
 
 
 
 
 
23
  # =============================================================
24
  st.set_page_config(page_title="πŸ“˜ Handbook Assistant", page_icon="πŸ“˜", layout="wide")
25
  st.title("πŸ“˜ USTP Student Handbook Assistant (2023 Edition)")
26
- st.caption("This assistant references only the *USTP Student Handbook 2023 Edition.pdf* located in the same folder.")
27
 
28
  load_dotenv()
29
  HF_TOKEN = os.getenv("HF_TOKEN")
30
 
31
  if not HF_TOKEN:
32
- st.warning("⚠️ HF_TOKEN not found in .env file. Hugging Face API calls will fail.")
33
  else:
34
  try:
35
  login(HF_TOKEN)
@@ -39,48 +49,58 @@ else:
39
  hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
40
 
41
  # =============================================================
42
- # βš™οΈ Configuration
43
  # =============================================================
44
- DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3" # strong, open, accurate
45
- EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
 
 
 
47
  INDEX_FILE = "handbook_faiss.index"
48
  META_FILE = "handbook_metadata.json"
49
  EMB_DIM_FILE = "handbook_emb_dim.json"
50
-
51
- with st.sidebar:
52
- st.header("βš™οΈ Settings")
53
- similarity_threshold = st.slider("Similarity Threshold", 0.3, 1.0, 0.62, 0.01)
54
- top_k = st.slider("Top K Results", 1, 10, 4)
55
- chunk_size_chars = st.number_input("Chunk Size (chars)", 400, 2500, 1200, 100)
56
- chunk_overlap = st.number_input("Chunk Overlap (chars)", 20, 600, 150, 10)
57
- regenerate_index = st.button("πŸ” Rebuild Handbook Index")
58
 
59
  # =============================================================
60
  # 🧩 Utility Functions
61
  # =============================================================
62
-
63
  def find_handbook() -> List[str]:
64
- """Locate the handbook PDF in the same folder."""
65
  preferred = "USTP Student Handbook 2023 Edition.pdf"
66
- current_dir = os.path.dirname(os.path.abspath(__file__))
67
- pdf_path = os.path.join(current_dir, preferred)
68
-
69
- if os.path.exists(pdf_path):
70
- st.info(f"πŸ“˜ Found handbook: {preferred}")
71
- return [pdf_path]
72
-
73
- pdfs = glob.glob(os.path.join(current_dir, "*.pdf"))
74
  if pdfs:
75
- st.warning(f"⚠️ Preferred handbook not found. Using {os.path.basename(pdfs[0])}")
76
  return [pdfs[0]]
77
-
78
- st.error("❌ No PDF found in the same folder as this app.")
79
  return []
80
 
81
 
82
  def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
83
- """Extract text from all pages of provided PDFs."""
84
  pages = []
85
  for path in pdf_paths:
86
  with open(path, "rb") as f:
@@ -88,12 +108,20 @@ def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
88
  for i, page in enumerate(reader.pages):
89
  text = page.extract_text() or ""
90
  if text.strip():
91
- pages.append({"filename": os.path.basename(path), "page": i + 1, "text": text})
 
 
 
 
 
 
 
 
 
92
  return pages
93
 
94
 
95
  def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dict[str, Any]]:
96
- """Split text into overlapping chunks."""
97
  chunks = []
98
  for p in pages:
99
  text = p["text"]
@@ -110,73 +138,50 @@ def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dic
110
  return chunks
111
 
112
 
113
- # βœ… FIXED SECTION
114
  def embed_texts(texts: List[str]) -> np.ndarray:
115
- """Get embeddings via Hugging Face Inference API with proper fallback."""
116
- if not HF_TOKEN:
117
- st.error("❌ Missing HF_TOKEN.")
118
  return np.zeros((len(texts), 768))
119
-
120
- # --- Primary method ---
121
  try:
122
- embeddings = hf_client.feature_extraction(
123
- texts, # βœ… positional, not keyword
124
- model=EMBED_MODEL
125
- )
126
-
127
- # Handle token-level embedding cases
128
  if isinstance(embeddings[0][0], list):
129
  embeddings = [np.mean(np.array(e), axis=0) for e in embeddings]
130
-
131
  return np.array(embeddings)
132
-
133
- # --- Fallback method ---
134
  except Exception as e1:
135
- st.warning(f"⚠️ feature_extraction() failed, using REST API fallback: {e1}")
136
- try:
137
- headers = {"Authorization": f"Bearer {HF_TOKEN}"}
138
- response = requests.post(
139
- f"https://api-inference.huggingface.co/models/{EMBED_MODEL}", # βœ… correct endpoint
140
- headers=headers,
141
- json={"inputs": texts}
142
- )
143
- response.raise_for_status()
144
- data = response.json()
145
-
146
- # Handle nested outputs
147
- if isinstance(data[0][0], list):
148
- embeddings = [np.mean(np.array(e), axis=0) for e in data]
149
- else:
150
- embeddings = [np.array(data)]
151
-
152
- return np.array(embeddings)
153
- except Exception as e2:
154
- st.error(f"Embedding error: {e2}")
155
- return np.zeros((len(texts), 768))
156
 
157
 
158
- def build_faiss_index(chunks: List[Dict[str, Any]]) -> None:
159
- """Build and save FAISS index for handbook chunks."""
160
  texts = [c["content"] for c in chunks]
161
  embeddings = embed_texts(texts)
162
  if embeddings.size == 0:
163
- st.error("Embedding generation failed; cannot build index.")
164
  return
165
-
166
  dim = embeddings.shape[1]
167
  index = faiss.IndexFlatL2(dim)
168
  index.add(embeddings.astype("float32"))
169
-
170
  faiss.write_index(index, INDEX_FILE)
171
  with open(META_FILE, "w") as f:
172
  json.dump(chunks, f)
173
  with open(EMB_DIM_FILE, "w") as f:
174
  json.dump({"dim": dim}, f)
 
175
 
176
 
177
  def load_faiss_index():
178
- """Load FAISS index and metadata if available."""
179
- if not (os.path.exists(INDEX_FILE) and os.path.exists(META_FILE)):
180
  return None, None
181
  index = faiss.read_index(INDEX_FILE)
182
  with open(META_FILE) as f:
@@ -184,94 +189,75 @@ def load_faiss_index():
184
  return index, meta
185
 
186
 
187
- def search_index(query: str, index, meta, top_k: int, threshold: float) -> List[Dict[str, Any]]:
188
- """Search FAISS for top-K similar chunks."""
189
  query_emb = embed_texts([query])
190
  distances, indices = index.search(query_emb.astype("float32"), top_k)
191
  results = []
192
  for i, dist in zip(indices[0], distances[0]):
193
  if i < len(meta):
194
- result = meta[i]
195
- result["distance"] = float(dist)
196
- results.append(result)
197
  return results
198
 
199
 
200
  def generate_answer(context: str, query: str) -> str:
201
- """Generate robust answer with explicit citations β€” auto-switches between endpoints."""
202
  prompt = f"""
203
- You are a precise academic assistant specialized in university policies.
204
- Use only the provided *USTP Student Handbook 2023 Edition* content as reference.
205
- If the answer is not explicitly found, respond with:
206
  "The handbook does not specify that."
207
 
208
  ---
209
- πŸ“˜ **Context (from the handbook)**:
210
  {context}
211
  ---
212
- 🧭 **Question**:
213
  {query}
214
  ---
215
- 🎯 **Instructions**:
216
- - Answer concisely and factually.
217
- - Include page numbers and filename references where relevant.
218
- - Do NOT invent or assume any information not in the handbook.
219
  """
220
 
221
- if not hf_client:
222
- return "❌ Hugging Face client not initialized."
223
-
224
- # Try standard text-generation first
225
  try:
226
  response = hf_client.text_generation(
227
  model=DEFAULT_MODEL,
228
  prompt=prompt,
229
  max_new_tokens=400,
230
- temperature=0.25,
231
- repetition_penalty=1.1,
232
  )
233
- return response
234
  except Exception as e1:
235
- # If it fails, automatically switch to conversational API
236
  try:
237
  chat_response = hf_client.chat.completions.create(
238
  model=DEFAULT_MODEL,
239
- messages=[
240
- {"role": "system", "content": "You are a precise and factual handbook assistant."},
241
- {"role": "user", "content": prompt},
242
- ],
243
- max_tokens=400,
244
- temperature=0.25,
245
  )
246
  return chat_response.choices[0].message["content"]
247
  except Exception as e2:
248
  return f"⚠️ Error generating answer: {e2}"
249
 
250
 
251
- # =============================================================
252
- # πŸ” Index Handling
253
- # =============================================================
254
  def ensure_index():
255
- """Ensure FAISS index is ready (build or load)."""
256
  if regenerate_index or not os.path.exists(INDEX_FILE):
257
  pdfs = find_handbook()
258
  if not pdfs:
259
  st.stop()
260
- st.info("πŸ“„ Loading and embedding handbook...")
261
  pages = load_pdf_texts(pdfs)
262
- if not pages:
263
- st.error("No text extracted from handbook.")
264
- st.stop()
265
  chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
266
  build_faiss_index(chunks)
267
- st.success("βœ… Handbook indexed successfully.")
268
  index, meta = load_faiss_index()
269
  if index is None or meta is None:
270
- st.error("Failed to load FAISS index.")
271
  st.stop()
272
  return index, meta
273
 
274
-
275
  # =============================================================
276
  # πŸ’¬ Chat Interface
277
  # =============================================================
@@ -281,7 +267,7 @@ st.subheader("πŸ’¬ Ask about the Handbook")
281
  if "history" not in st.session_state:
282
  st.session_state.history = []
283
 
284
- user_query = st.text_input("Your question about the handbook:")
285
  index, meta = ensure_index()
286
 
287
  if st.button("Ask") and user_query.strip():
@@ -289,14 +275,14 @@ if st.button("Ask") and user_query.strip():
289
  if not results:
290
  st.warning("No relevant section found in the handbook.")
291
  else:
292
- context_text = "\n\n".join(
293
- [f"(πŸ“„ Page {r['page']} β€” {r['filename']})\n{r['content']}" for r in results]
294
  )
295
- answer = generate_answer(context_text, user_query)
296
  st.session_state.history.append({"user": user_query, "assistant": answer})
297
 
298
  for chat in st.session_state.history:
299
  st_message(chat["user"], is_user=True)
300
  st_message(chat["assistant"])
301
 
302
- st.caption("⚑ Powered by FAISS + Hugging Face Inference API + Mistral 7B")
 
1
+ # =============================================================
2
+ # πŸ“˜ USTP Student Handbook Assistant (2023 Edition)
3
+ # =============================================================
4
+ # Enhanced: dynamic model selection + real (printed) page numbering
5
+
6
  import os
 
7
  import glob
8
  import json
9
+ import time
10
  from typing import List, Dict, Any
 
11
  import numpy as np
12
  import streamlit as st
13
  import PyPDF2
 
16
  from huggingface_hub import InferenceClient, login
17
  from streamlit_chat import message as st_message
18
 
19
+ # Optional: FAISS for fast vector search
20
  try:
21
  import faiss
22
  except ImportError:
23
  faiss = None
24
 
25
  # =============================================================
26
+ # 🌐 Startup Fix for PermissionError
27
+ # =============================================================
28
+ os.environ["STREAMLIT_HOME"] = "/tmp/.streamlit"
29
+ os.makedirs("/tmp/.streamlit", exist_ok=True)
30
+
31
+ # =============================================================
32
+ # βš™οΈ Streamlit Page Setup
33
  # =============================================================
34
  st.set_page_config(page_title="πŸ“˜ Handbook Assistant", page_icon="πŸ“˜", layout="wide")
35
  st.title("πŸ“˜ USTP Student Handbook Assistant (2023 Edition)")
36
+ st.caption("Answers sourced only from the official *USTP Student Handbook 2023 Edition.pdf*.")
37
 
38
  load_dotenv()
39
  HF_TOKEN = os.getenv("HF_TOKEN")
40
 
41
  if not HF_TOKEN:
42
+ st.warning("⚠️ No Hugging Face API token found in .env file. Online models will be unavailable.")
43
  else:
44
  try:
45
  login(HF_TOKEN)
 
49
  hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
50
 
51
# =============================================================
# ⚙️ Sidebar Configuration
# =============================================================
with st.sidebar:
    st.header("⚙️ Settings")

    # Human-readable labels mapped to Hugging Face model ids.
    model_options = {
        "Qwen 2.5 14B Instruct": "Qwen/Qwen2.5-14B-Instruct",
        "Mistral 7B Instruct": "mistralai/Mistral-7B-Instruct-v0.3",
        "Llama 3 8B Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
        "Mixtral 8x7B Instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "Falcon 7B Instruct": "tiiuae/falcon-7b-instruct",
    }
    model_choice = st.selectbox("Select reasoning model", list(model_options.keys()), index=0)
    DEFAULT_MODEL = model_options[model_choice]

    st.markdown("---")
    # Retrieval / chunking knobs used by the indexing and search functions.
    similarity_threshold = st.slider("Similarity threshold", 0.3, 1.0, 0.6, 0.01)
    top_k = st.slider("Top K retrieved chunks", 1, 10, 4)
    chunk_size_chars = st.number_input("Chunk size (chars)", 400, 2500, 1200, 100)
    chunk_overlap = st.number_input("Chunk overlap (chars)", 20, 600, 150, 10)
    # Offset between physical PDF pages and the handbook's printed numbering.
    front_matter_pages = st.number_input(
        "Pages before main content (e.g. table of contents, cover)", min_value=0, max_value=50, value=12
    )
    regenerate_index = st.button("🔁 Rebuild handbook index")

# =============================================================
# 📂 File Config
# =============================================================
INDEX_FILE = "handbook_faiss.index"
META_FILE = "handbook_metadata.json"
EMB_DIM_FILE = "handbook_emb_dim.json"
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
 
 
 
 
 
 
84
 
85
  # =============================================================
86
  # 🧩 Utility Functions
87
  # =============================================================
 
88
def find_handbook() -> List[str]:
    """Locate the handbook PDF, preferring the official 2023 edition.

    Searches the directory containing this script rather than the process
    working directory (under Streamlit the CWD is not guaranteed to be the
    app folder), then falls back to any other PDF found there.

    Returns:
        A single-element list with the chosen PDF path, or [] if none found.
    """
    preferred = "USTP Student Handbook 2023 Edition.pdf"
    # Anchor the search to this file's directory so behavior does not depend
    # on where the Streamlit process was launched from; fall back to CWD.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    pdfs = glob.glob(os.path.join(base_dir, "*.pdf")) or glob.glob("*.pdf")
    for path in pdfs:
        if preferred.lower() in os.path.basename(path).lower():
            st.success(f"📘 Found handbook: {os.path.basename(path)}")
            return [path]
    if pdfs:
        st.warning(f"⚠️ Preferred handbook not found. Using {os.path.basename(pdfs[0])}.")
        return [pdfs[0]]
    st.error("❌ No PDF found in current folder.")
    return []
100
 
101
 
102
def load_pdf_texts(pdf_paths: List[str], front_matter=None) -> List[Dict[str, Any]]:
    """Extract page text while adjusting page numbering to printed handbook numbers.

    Args:
        pdf_paths: Paths of PDF files to read.
        front_matter: Number of leading pages (cover, table of contents)
            before printed page 1. Defaults to the sidebar-configured
            ``front_matter_pages`` value when omitted, so existing callers
            are unaffected.

    Returns:
        One dict per non-empty page with keys ``filename``, ``page``
        (printed page number, clamped to >= 1) and ``text``.
    """
    if front_matter is None:
        # Fall back to the module-level sidebar setting for compatibility.
        front_matter = front_matter_pages
    pages = []
    for path in pdf_paths:
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for i, page in enumerate(reader.pages):
                text = page.extract_text() or ""
                if text.strip():
                    # Map the physical page index to the printed page number
                    # by subtracting the front-matter offset; pages inside
                    # the front matter clamp to printed page 1.
                    printed_page = max(1, (i + 1) - front_matter)
                    pages.append({
                        "filename": os.path.basename(path),
                        "page": printed_page,
                        "text": text.strip(),
                    })
    return pages
122
 
123
 
124
  def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dict[str, Any]]:
 
125
  chunks = []
126
  for p in pages:
127
  text = p["text"]
 
138
  return chunks
139
 
140
 
 
141
def embed_texts(texts: List[str]) -> np.ndarray:
    """Generate embeddings using Hugging Face feature extraction.

    Tries the InferenceClient first, then falls back to the raw Inference
    REST API. On any unrecoverable failure a zero matrix of shape
    ``(len(texts), 768)`` is returned so callers can detect failure
    without the app crashing.
    """
    if not HF_TOKEN or not hf_client:
        st.error("❌ Missing Hugging Face token or client.")
        return np.zeros((len(texts), 768))
    try:
        embeddings = hf_client.feature_extraction(texts, model=EMBED_MODEL)
        # Token-level outputs arrive as one vector per token; mean-pool
        # them into a single sentence vector.
        if isinstance(embeddings[0][0], list):
            embeddings = [np.mean(np.array(e), axis=0) for e in embeddings]
        return np.array(embeddings)
    except Exception as e1:
        st.warning(f"⚠️ feature_extraction failed, using REST API fallback: {e1}")
        try:
            # Local import: `requests` is not guaranteed to be imported at
            # module level in this file.
            import requests

            headers = {"Authorization": f"Bearer {HF_TOKEN}"}
            resp = requests.post(
                f"https://api-inference.huggingface.co/models/{EMBED_MODEL}",
                headers=headers,
                json={"inputs": texts},
            )
            # Surface HTTP errors (401, 503 model-loading, ...) explicitly
            # instead of failing later on an unexpected JSON payload.
            resp.raise_for_status()
            data = resp.json()
            if isinstance(data[0][0], list):
                data = [np.mean(np.array(e), axis=0) for e in data]
            return np.array(data)
        except Exception as e2:
            # Keep the zero-matrix failure contract instead of crashing.
            st.error(f"Embedding error: {e2}")
            return np.zeros((len(texts), 768))
 
 
 
 
 
 
 
 
 
 
163
 
164
 
165
def build_faiss_index(chunks: List[Dict[str, Any]]):
    """Build and persist a FAISS L2 index over the chunk embeddings.

    Writes the index, the chunk metadata, and the embedding dimension to
    the module-level INDEX_FILE / META_FILE / EMB_DIM_FILE paths. Aborts
    with a Streamlit error (instead of raising) when faiss is unavailable
    or embedding generation failed.
    """
    if faiss is None:
        # faiss is an optional dependency: the module-top import is wrapped
        # in try/except ImportError, so guard before using it.
        st.error("❌ faiss is not installed; cannot build index.")
        return
    texts = [c["content"] for c in chunks]
    embeddings = embed_texts(texts)
    if embeddings.size == 0:
        st.error("❌ Embedding generation failed.")
        return
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings.astype("float32"))
    faiss.write_index(index, INDEX_FILE)
    with open(META_FILE, "w") as f:
        json.dump(chunks, f)
    with open(EMB_DIM_FILE, "w") as f:
        json.dump({"dim": dim}, f)
    st.success(f"✅ Indexed {len(chunks)} chunks.")
181
 
182
 
183
def load_faiss_index():
    """Load the persisted FAISS index and chunk metadata.

    Returns:
        ``(index, meta)`` on success, or ``(None, None)`` when either
        persisted file is missing.
    """
    have_index = os.path.exists(INDEX_FILE)
    have_meta = os.path.exists(META_FILE)
    if not (have_index and have_meta):
        return None, None
    index = faiss.read_index(INDEX_FILE)
    with open(META_FILE) as f:
        meta = json.load(f)
    return index, meta
190
 
191
 
192
def search_index(query: str, index, meta, top_k: int, threshold: float):
    """Return up to ``top_k`` metadata chunks nearest to the query embedding.

    Args:
        query: User question to embed and search for.
        index: FAISS index built by ``build_faiss_index``.
        meta: Chunk metadata list aligned with the index rows.
        top_k: Maximum number of results.
        threshold: NOTE(review): currently unused (kept for interface
            compatibility); results are ranked purely by L2 distance.

    Returns:
        Copies of the matching metadata dicts with a ``distance`` key added.
    """
    query_emb = embed_texts([query])
    distances, indices = index.search(query_emb.astype("float32"), top_k)
    results = []
    for i, dist in zip(indices[0], distances[0]):
        # FAISS pads missing results with -1, which would otherwise pass
        # `i < len(meta)` and silently return the LAST chunk — guard both ends.
        if 0 <= i < len(meta):
            # Copy so the cached metadata list is never mutated in place.
            r = dict(meta[i])
            r["distance"] = float(dist)
            results.append(r)
    return results
202
 
203
 
204
def generate_answer(context: str, query: str) -> str:
    """Generate a handbook-grounded answer using the selected model.

    Tries the text-generation endpoint first, then falls back to the
    chat-completions endpoint (some hosted models only serve one task).
    Returns an error string rather than raising on failure.
    """
    prompt = f"""
You are a precise academic assistant specialized in university policy.
Use only the *USTP Student Handbook 2023 Edition* below.
If the answer is not in the text, reply:
"The handbook does not specify that."

---
📘 Context:
{context}
---
🧭 Question:
{query}
---
🎯 Instructions:
- Be factual and concise.
- Cite the correct printed page number.
- Never make assumptions.
"""
    if hf_client is None:
        # No HF token at startup means no client; fail gracefully instead
        # of raising AttributeError on the calls below.
        return "⚠️ Error generating answer: Hugging Face client not initialized."
    try:
        response = hf_client.text_generation(
            model=DEFAULT_MODEL,
            prompt=prompt,
            max_new_tokens=400,
            temperature=0.25,
        )
        # Some client versions return rich objects; always hand back a str.
        return response if isinstance(response, str) else str(response)
    except Exception:
        # The model may only expose the conversational task — retry via chat.
        try:
            chat_response = hf_client.chat.completions.create(
                model=DEFAULT_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=400,
            )
            msg = chat_response.choices[0].message
            # huggingface_hub returns a dataclass-like message object in
            # newer versions; support both mapping and attribute access.
            return msg["content"] if isinstance(msg, dict) else msg.content
        except Exception as e2:
            return f"⚠️ Error generating answer: {e2}"
243
 
244
 
 
 
 
245
def ensure_index():
    """Ensure a FAISS index exists (building it if needed) and return it.

    Rebuilds when the sidebar rebuild button was pressed or no index file
    exists on disk; stops the Streamlit script on any unrecoverable
    failure (no PDF, no extractable text, unloadable index).
    """
    if regenerate_index or not os.path.exists(INDEX_FILE):
        pdfs = find_handbook()
        if not pdfs:
            st.stop()
        st.info("📄 Extracting handbook text...")
        pages = load_pdf_texts(pdfs)
        if not pages:
            # Scanned/image-only PDFs yield no extractable text; without
            # this guard an empty index build fails much less clearly.
            st.error("No text extracted from handbook.")
            st.stop()
        chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
        build_faiss_index(chunks)
    index, meta = load_faiss_index()
    if index is None or meta is None:
        st.error("❌ Could not load FAISS index.")
        st.stop()
    return index, meta
260
 
 
261
  # =============================================================
262
  # πŸ’¬ Chat Interface
263
  # =============================================================
 
267
  if "history" not in st.session_state:
268
  st.session_state.history = []
269
 
270
+ user_query = st.text_input("Enter your question:")
271
  index, meta = ensure_index()
272
 
273
  if st.button("Ask") and user_query.strip():
 
275
  if not results:
276
  st.warning("No relevant section found in the handbook.")
277
  else:
278
+ context = "\n\n".join(
279
+ [f"(πŸ“„ Page {r['page']})\n{r['content']}" for r in results]
280
  )
281
+ answer = generate_answer(context, user_query)
282
  st.session_state.history.append({"user": user_query, "assistant": answer})
283
 
284
  for chat in st.session_state.history:
285
  st_message(chat["user"], is_user=True)
286
  st_message(chat["assistant"])
287
 
288
+ st.caption("⚑ Powered by FAISS + Open Source Models + Accurate Page Referencing")