Starberry15 committed on
Commit
13ec6bf
Β·
verified Β·
1 Parent(s): 22206db

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +260 -33
src/streamlit_app.py CHANGED
@@ -1,40 +1,267 @@
1
- import altair as alt
 
 
 
 
2
  import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
12
 
13
- In the meantime, below is an example of what you can do with just a few lines of code:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  """
15
 
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import glob
4
+ import json
5
+ from typing import List, Dict, Any
6
  import numpy as np
 
7
  import streamlit as st
8
+ import PyPDF2
9
+ from dotenv import load_dotenv
10
+ from huggingface_hub import InferenceClient, login
11
+ from streamlit_chat import message as st_message
12
 
13
# =============================================================
# 🧩 Try optional imports
# =============================================================
# Both are heavyweight optional dependencies. When an import fails we
# bind the name to None; downstream code must check for None before use.
try:
    import faiss
except ImportError:
    faiss = None

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    SentenceTransformer = None
26
# =============================================================
# 🌐 Environment & Page Setup
# =============================================================
# NOTE: Streamlit requires set_page_config before any other st.* call.
st.set_page_config(page_title="📘 Handbook Assistant", page_icon="📘", layout="wide")
st.title("📘 USTP Student Handbook Assistant (2023 Edition)")
st.caption("References only *USTP Student Handbook 2023 Edition.pdf* in this folder.")

# Read HF_TOKEN from a local .env file (if present) or the environment.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

if HF_TOKEN:
    try:
        login(HF_TOKEN)
    except Exception:
        # Login failure is non-fatal: InferenceClient below may still work.
        st.warning("⚠️ Could not login to Hugging Face.")
# With no token the client stays None; generate_answer() reports that case.
hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
42
+
43
# =============================================================
# βš™οΈ Sidebar Configuration
# =============================================================
with st.sidebar:
    st.header("⚙️ Settings")

    # Remote LLM used for answer generation (first entry is the default).
    model_choice = st.selectbox(
        "Select LLM model",
        [
            "Qwen/Qwen2.5-14B-Instruct",
            "mistralai/Mistral-7B-Instruct-v0.3",
            "meta-llama/Meta-Llama-3-8B-Instruct",
            "tiiuae/falcon-7b-instruct",
        ],
        index=0,  # Default: Qwen 14B
    )

    # Retrieval knobs consumed by search_index() and chunk_text() below.
    # NOTE(review): similarity_threshold is read by the UI but never applied
    # by search_index — confirm intended semantics before relying on it.
    similarity_threshold = st.slider("Similarity Threshold", 0.3, 1.0, 0.6, 0.01)
    top_k = st.slider("Top K Results", 1, 10, 4)
    chunk_size_chars = st.number_input("Chunk Size (chars)", 400, 2500, 1200, 100)
    chunk_overlap = st.number_input("Chunk Overlap (chars)", 20, 600, 150, 10)
    # True for exactly one script run after the user clicks the button.
    regenerate_index = st.button("🔁 Rebuild Handbook Index")
65
+
66
+ # =============================================================
67
+ # 🧠 Utility Functions
68
+ # =============================================================
69
+
70
def find_handbook() -> List[str]:
    """Locate the handbook PDF in the script's own folder.

    Prefers the exact 2023-edition filename. Otherwise falls back to the
    alphabetically first PDF so the pick is deterministic (a bare
    ``glob.glob`` returns files in filesystem-dependent order).

    Returns:
        A single-element list with the chosen PDF path, or [] if none exist.
    """
    preferred = "USTP Student Handbook 2023 Edition.pdf"
    current_dir = os.path.dirname(os.path.abspath(__file__))
    pdf_path = os.path.join(current_dir, preferred)
    if os.path.exists(pdf_path):
        return [pdf_path]

    # Fallback: sort so the choice is stable across runs and machines.
    pdfs = sorted(glob.glob(os.path.join(current_dir, "*.pdf")))
    if pdfs:
        st.warning(f"⚠️ Using {os.path.basename(pdfs[0])} (preferred handbook not found)")
        return [pdfs[0]]

    st.error("❌ No PDF found in this folder.")
    return []
84
+
85
def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
    """Extract per-page text from each PDF, dropping pages with no text.

    Returns dicts of {"filename", "page" (1-based), "text"}.
    """
    extracted: List[Dict[str, Any]] = []
    for pdf_path in pdf_paths:
        name = os.path.basename(pdf_path)
        with open(pdf_path, "rb") as handle:
            for page_no, page in enumerate(PyPDF2.PdfReader(handle).pages, start=1):
                page_text = page.extract_text() or ""
                if page_text.strip():
                    extracted.append({"filename": name, "page": page_no, "text": page_text})
    return extracted
95
+
96
def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int):
    """Split page texts into overlapping character chunks.

    Args:
        pages: dicts with "filename", "page", and "text" keys.
        size: chunk length in characters.
        overlap: characters shared between consecutive chunks.

    Returns:
        List of {"filename", "page", "content"} dicts, content stripped.
    """
    # Bug fix: with overlap >= size the original step (size - overlap) was
    # <= 0 and the while-loop never terminated. Clamp the step to >= 1.
    step = max(1, size - overlap)
    chunks = []
    for p in pages:
        text = p["text"]
        start = 0
        while start < len(text):
            end = start + size
            chunks.append({
                "filename": p["filename"],
                "page": p["page"],
                "content": text[start:end].strip()
            })
            start += step
    return chunks
110
+
111
+ # =============================================================
112
+ # 🧠 Embeddings (Stable + Non-blocking)
113
+ # =============================================================
114
+
115
@st.cache_resource
def load_local_embedder():
    """Load a local sentence-embedding model, cached for the session.

    Tries the preferred MiniLM model first, then a smaller fallback.
    Returns None (after surfacing an error in the UI) if both fail.

    Raises:
        ImportError: if sentence-transformers is not installed.
    """
    if SentenceTransformer is None:
        raise ImportError("sentence-transformers not installed.")
    last_err = None
    for model_name in ("all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2"):
        try:
            return SentenceTransformer(model_name)
        except Exception as e:
            last_err = e
    st.error(f"Embedding model load failed: {last_err}")
    return None
128
 
129
def embed_texts(texts: List[str]) -> np.ndarray:
    """Embed texts with the local model; zero vectors on total failure.

    The zero-vector fallback (shape (n, 384), float32) keeps downstream
    FAISS code from freezing when no embedder is available.
    """
    try:
        embedder = load_local_embedder()
        if embedder:
            return embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
    except Exception as exc:
        st.warning(f"⚠️ Local embedding failed: {exc}")

    # Last resort: surface the failure but return something well-shaped.
    st.error("❌ Could not generate embeddings; returning empty array.")
    return np.zeros((len(texts), 384), dtype="float32")
142
+
143
+ # =============================================================
144
+ # πŸ—‚οΈ FAISS Index
145
+ # =============================================================
146
# On-disk cache files: the FAISS vector index and its aligned chunk metadata.
INDEX_FILE = "handbook_faiss.index"
META_FILE = "handbook_metadata.json"
148
+
149
def build_faiss_index(chunks):
    """Embed chunk texts and persist a FAISS L2 index plus metadata.

    Side effects: writes INDEX_FILE and META_FILE. Returns nothing; on
    failure it reports via st.error and leaves existing files untouched.
    """
    # Bug fix: faiss is an optional import bound to None when missing; the
    # original crashed with AttributeError on faiss.IndexFlatL2 here.
    if faiss is None:
        st.error("❌ faiss is not installed; cannot build the index.")
        return

    texts = [c["content"] for c in chunks]
    embeddings = embed_texts(texts)
    if embeddings.size == 0:
        st.error("❌ Embedding generation failed.")
        return

    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings.astype("float32"))
    faiss.write_index(index, INDEX_FILE)
    with open(META_FILE, "w") as f:
        json.dump(chunks, f)
162
+
163
def load_faiss_index():
    """Load the persisted FAISS index and metadata.

    Returns:
        (index, meta) on success, or (None, None) when faiss is missing
        or either cache file does not exist.
    """
    # Guard: faiss may be None (optional import); the original would have
    # raised AttributeError on faiss.read_index in that case.
    if faiss is None:
        return None, None
    if not (os.path.exists(INDEX_FILE) and os.path.exists(META_FILE)):
        return None, None
    index = faiss.read_index(INDEX_FILE)
    with open(META_FILE) as f:
        meta = json.load(f)
    return index, meta
170
+
171
+ # =============================================================
172
+ # πŸ” Search
173
+ # =============================================================
174
def search_index(query: str, index, meta, top_k: int, threshold: float):
    """Return metadata for the top-k chunks nearest the query embedding.

    Each result is a copy of the metadata dict with an added "distance"
    (raw L2 distance; smaller is closer).

    NOTE(review): `threshold` is accepted but not applied — distances are
    raw L2 values, so a 0-1 similarity cutoff needs a defined mapping
    first. Left unfiltered to preserve current behavior.
    """
    query_emb = embed_texts([query])
    distances, indices = index.search(query_emb.astype("float32"), top_k)
    results = []
    for i, dist in zip(indices[0], distances[0]):
        # Bug fix: FAISS pads ids with -1 when fewer than top_k vectors
        # exist; the old `i < len(meta)` let -1 through and returned the
        # *last* chunk via meta[-1]. Require a valid non-negative index.
        if 0 <= i < len(meta):
            # Copy so "distance" is not injected into the cached metadata.
            result = dict(meta[i])
            result["distance"] = float(dist)
            results.append(result)
    return results
184
+
185
+ # =============================================================
186
+ # πŸ’¬ Answer Generation
187
+ # =============================================================
188
def generate_answer(context: str, query: str, model_name: str):
    """Ask the selected HF model to answer `query` using only `context`.

    Returns the generated text, or a human-readable error string when the
    client is missing or the API call fails (callers render either).
    """
    # Prompt pins the model to the retrieved handbook excerpts and gives it
    # an explicit "not found" escape hatch to reduce hallucination.
    prompt = f"""
You are a precise academic assistant specialized in university policies.
Use only the provided *USTP Student Handbook 2023 Edition* content as reference.
If the answer is not explicitly found, respond with:
"The handbook does not specify that."

---
📘 **Context (from the handbook)**:
{context}
---
🧭 **Question**:
{query}
---
🎯 **Instructions**:
- Answer concisely and factually.
- Include page numbers and filename references where relevant.
"""

    # hf_client is None when no HF_TOKEN was configured at startup.
    if not hf_client:
        return "❌ Hugging Face client not initialized."

    try:
        response = hf_client.text_generation(
            model=model_name,
            prompt=prompt,
            max_new_tokens=400,
            temperature=0.25,       # low temperature for factual answers
            repetition_penalty=1.1,
        )
        return response
    except Exception as e:
        # Network/model errors are returned as text rather than raised, so
        # the chat UI can display them inline.
        return f"⚠️ Error generating answer: {e}"
221
+
222
+ # =============================================================
223
+ # βœ… Ensure Index Loads Immediately
224
+ # =============================================================
225
def ensure_index():
    """Return (index, meta), rebuilding the on-disk index when needed.

    Rebuilds when the user pressed the sidebar button or when either
    cache file is missing. (Bug fix: the original only checked
    INDEX_FILE, so a missing META_FILE silently produced (None, None)
    from load_faiss_index and crashed the search downstream.)
    """
    have_cache = os.path.exists(INDEX_FILE) and os.path.exists(META_FILE)
    if regenerate_index or not have_cache:
        pdfs = find_handbook()
        if not pdfs:
            st.stop()
        pages = load_pdf_texts(pdfs)
        if not pages:
            st.error("No text extracted.")
            st.stop()
        chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
        build_faiss_index(chunks)
        st.success("✅ Index rebuilt.")
    return load_faiss_index()
238
+
239
+ # =============================================================
240
+ # 🧠 Main Chat Interface
241
+ # =============================================================
242
st.divider()
st.subheader("💬 Ask about the Handbook")

index, meta = ensure_index()

# Per-session chat transcript: list of {"user": ..., "assistant": ...}.
if "history" not in st.session_state:
    st.session_state.history = []

user_query = st.text_input("Your question about the handbook:", key="user_input")

if st.button("Ask", key="ask_btn") and user_query.strip():
    # Bug fix: ensure_index() can return (None, None) when faiss is
    # unavailable or the build failed; the original crashed on
    # index.search inside search_index. Fail with a clear message instead.
    if index is None or meta is None:
        st.error("Index unavailable — rebuild it from the sidebar.")
    else:
        results = search_index(user_query, index, meta, top_k, similarity_threshold)
        if not results:
            st.warning("No relevant section found.")
        else:
            # Cite page + filename so the model can reference its sources.
            context_text = "\n\n".join(
                [f"(📄 Page {r['page']} — {r['filename']})\n{r['content']}" for r in results]
            )
            answer = generate_answer(context_text, user_query, model_choice)
            st.session_state.history.append({"user": user_query, "assistant": answer})

# Render the full transcript on every rerun (Streamlit re-executes the script).
for i, chat in enumerate(st.session_state.history):
    st_message(chat["user"], is_user=True, key=f"user_{i}")
    st_message(chat["assistant"], key=f"assistant_{i}")

st.caption("⚡ Powered by FAISS + Local Embeddings + Qwen 14B")