Starberry15 committed on
Commit
f521fb7
Β·
verified Β·
1 Parent(s): 95d0828

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +107 -121
src/streamlit_app.py CHANGED
@@ -1,9 +1,13 @@
 
 
 
 
 
1
  import os
2
- import time
3
  import glob
4
  import json
 
5
  from typing import List, Dict, Any
6
-
7
  import numpy as np
8
  import streamlit as st
9
  import PyPDF2
@@ -12,24 +16,30 @@ from dotenv import load_dotenv
12
  from huggingface_hub import InferenceClient, login
13
  from streamlit_chat import message as st_message
14
 
15
- # Try importing FAISS
16
  try:
17
  import faiss
18
  except ImportError:
19
  faiss = None
20
 
21
  # =============================================================
22
- # 🌐 Environment & Page Setup
 
 
 
 
 
 
23
  # =============================================================
24
  st.set_page_config(page_title="πŸ“˜ Handbook Assistant", page_icon="πŸ“˜", layout="wide")
25
  st.title("πŸ“˜ USTP Student Handbook Assistant (2023 Edition)")
26
- st.caption("This assistant references only the *USTP Student Handbook 2023 Edition.pdf* located in the same folder.")
27
 
28
  load_dotenv()
29
  HF_TOKEN = os.getenv("HF_TOKEN")
30
 
31
  if not HF_TOKEN:
32
- st.warning("⚠️ HF_TOKEN not found in .env file. Hugging Face API calls will fail.")
33
  else:
34
  try:
35
  login(HF_TOKEN)
@@ -39,48 +49,58 @@ else:
39
  hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
40
 
41
  # =============================================================
42
- # βš™οΈ Configuration
43
  # =============================================================
44
- DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3" # strong, open, accurate
45
- EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
 
 
 
47
  INDEX_FILE = "handbook_faiss.index"
48
  META_FILE = "handbook_metadata.json"
49
  EMB_DIM_FILE = "handbook_emb_dim.json"
50
-
51
- with st.sidebar:
52
- st.header("βš™οΈ Settings")
53
- similarity_threshold = st.slider("Similarity Threshold", 0.3, 1.0, 0.62, 0.01)
54
- top_k = st.slider("Top K Results", 1, 10, 4)
55
- chunk_size_chars = st.number_input("Chunk Size (chars)", 400, 2500, 1200, 100)
56
- chunk_overlap = st.number_input("Chunk Overlap (chars)", 20, 600, 150, 10)
57
- regenerate_index = st.button("πŸ” Rebuild Handbook Index")
58
 
59
  # =============================================================
60
  # 🧩 Utility Functions
61
  # =============================================================
62
-
63
  def find_handbook() -> List[str]:
64
- """Locate the handbook PDF in the same folder."""
65
  preferred = "USTP Student Handbook 2023 Edition.pdf"
66
- current_dir = os.path.dirname(os.path.abspath(__file__))
67
- pdf_path = os.path.join(current_dir, preferred)
68
-
69
- if os.path.exists(pdf_path):
70
- st.info(f"πŸ“˜ Found handbook: {preferred}")
71
- return [pdf_path]
72
-
73
- pdfs = glob.glob(os.path.join(current_dir, "*.pdf"))
74
  if pdfs:
75
- st.warning(f"⚠️ Preferred handbook not found. Using {os.path.basename(pdfs[0])}")
76
  return [pdfs[0]]
77
-
78
- st.error("❌ No PDF found in the same folder as this app.")
79
  return []
80
 
81
 
82
  def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
83
- """Extract text from all pages of provided PDFs."""
84
  pages = []
85
  for path in pdf_paths:
86
  with open(path, "rb") as f:
@@ -88,12 +108,20 @@ def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
88
  for i, page in enumerate(reader.pages):
89
  text = page.extract_text() or ""
90
  if text.strip():
91
- pages.append({"filename": os.path.basename(path), "page": i + 1, "text": text})
 
 
 
 
 
 
 
 
 
92
  return pages
93
 
94
 
95
  def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dict[str, Any]]:
96
- """Split text into overlapping chunks."""
97
  chunks = []
98
  for p in pages:
99
  text = p["text"]
@@ -110,73 +138,50 @@ def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dic
110
  return chunks
111
 
112
 
113
- # βœ… FIXED SECTION
114
  def embed_texts(texts: List[str]) -> np.ndarray:
115
- """Get embeddings via Hugging Face Inference API with proper fallback."""
116
- if not HF_TOKEN:
117
- st.error("❌ Missing HF_TOKEN.")
118
  return np.zeros((len(texts), 768))
119
-
120
- # --- Primary method ---
121
  try:
122
- embeddings = hf_client.feature_extraction(
123
- texts, # βœ… positional, not keyword
124
- model=EMBED_MODEL
125
- )
126
-
127
- # Handle token-level embedding cases
128
  if isinstance(embeddings[0][0], list):
129
  embeddings = [np.mean(np.array(e), axis=0) for e in embeddings]
130
-
131
  return np.array(embeddings)
132
-
133
- # --- Fallback method ---
134
  except Exception as e1:
135
- st.warning(f"⚠️ feature_extraction() failed, using REST API fallback: {e1}")
136
- try:
137
- headers = {"Authorization": f"Bearer {HF_TOKEN}"}
138
- response = requests.post(
139
- f"https://api-inference.huggingface.co/models/{EMBED_MODEL}", # βœ… correct endpoint
140
- headers=headers,
141
- json={"inputs": texts}
142
- )
143
- response.raise_for_status()
144
- data = response.json()
145
-
146
- # Handle nested outputs
147
- if isinstance(data[0][0], list):
148
- embeddings = [np.mean(np.array(e), axis=0) for e in data]
149
- else:
150
- embeddings = [np.array(data)]
151
-
152
- return np.array(embeddings)
153
- except Exception as e2:
154
- st.error(f"Embedding error: {e2}")
155
- return np.zeros((len(texts), 768))
156
 
157
 
158
- def build_faiss_index(chunks: List[Dict[str, Any]]) -> None:
159
- """Build and save FAISS index for handbook chunks."""
160
  texts = [c["content"] for c in chunks]
161
  embeddings = embed_texts(texts)
162
  if embeddings.size == 0:
163
- st.error("Embedding generation failed; cannot build index.")
164
  return
165
-
166
  dim = embeddings.shape[1]
167
  index = faiss.IndexFlatL2(dim)
168
  index.add(embeddings.astype("float32"))
169
-
170
  faiss.write_index(index, INDEX_FILE)
171
  with open(META_FILE, "w") as f:
172
  json.dump(chunks, f)
173
  with open(EMB_DIM_FILE, "w") as f:
174
  json.dump({"dim": dim}, f)
 
175
 
176
 
177
  def load_faiss_index():
178
- """Load FAISS index and metadata if available."""
179
- if not (os.path.exists(INDEX_FILE) and os.path.exists(META_FILE)):
180
  return None, None
181
  index = faiss.read_index(INDEX_FILE)
182
  with open(META_FILE) as f:
@@ -184,94 +189,75 @@ def load_faiss_index():
184
  return index, meta
185
 
186
 
187
- def search_index(query: str, index, meta, top_k: int, threshold: float) -> List[Dict[str, Any]]:
188
- """Search FAISS for top-K similar chunks."""
189
  query_emb = embed_texts([query])
190
  distances, indices = index.search(query_emb.astype("float32"), top_k)
191
  results = []
192
  for i, dist in zip(indices[0], distances[0]):
193
  if i < len(meta):
194
- result = meta[i]
195
- result["distance"] = float(dist)
196
- results.append(result)
197
  return results
198
 
199
 
200
  def generate_answer(context: str, query: str) -> str:
201
- """Generate robust answer with explicit citations β€” auto-switches between endpoints."""
202
  prompt = f"""
203
- You are a precise academic assistant specialized in university policies.
204
- Use only the provided *USTP Student Handbook 2023 Edition* content as reference.
205
- If the answer is not explicitly found, respond with:
206
  "The handbook does not specify that."
207
 
208
  ---
209
- πŸ“˜ **Context (from the handbook)**:
210
  {context}
211
  ---
212
- 🧭 **Question**:
213
  {query}
214
  ---
215
- 🎯 **Instructions**:
216
- - Answer concisely and factually.
217
- - Include page numbers and filename references where relevant.
218
- - Do NOT invent or assume any information not in the handbook.
219
  """
220
 
221
- if not hf_client:
222
- return "❌ Hugging Face client not initialized."
223
-
224
- # Try standard text-generation first
225
  try:
226
  response = hf_client.text_generation(
227
  model=DEFAULT_MODEL,
228
  prompt=prompt,
229
  max_new_tokens=400,
230
- temperature=0.25,
231
- repetition_penalty=1.1,
232
  )
233
- return response
234
  except Exception as e1:
235
- # If it fails, automatically switch to conversational API
236
  try:
237
  chat_response = hf_client.chat.completions.create(
238
  model=DEFAULT_MODEL,
239
- messages=[
240
- {"role": "system", "content": "You are a precise and factual handbook assistant."},
241
- {"role": "user", "content": prompt},
242
- ],
243
- max_tokens=400,
244
- temperature=0.25,
245
  )
246
  return chat_response.choices[0].message["content"]
247
  except Exception as e2:
248
  return f"⚠️ Error generating answer: {e2}"
249
 
250
 
251
- # =============================================================
252
- # πŸ” Index Handling
253
- # =============================================================
254
  def ensure_index():
255
- """Ensure FAISS index is ready (build or load)."""
256
  if regenerate_index or not os.path.exists(INDEX_FILE):
257
  pdfs = find_handbook()
258
  if not pdfs:
259
  st.stop()
260
- st.info("πŸ“„ Loading and embedding handbook...")
261
  pages = load_pdf_texts(pdfs)
262
- if not pages:
263
- st.error("No text extracted from handbook.")
264
- st.stop()
265
  chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
266
  build_faiss_index(chunks)
267
- st.success("βœ… Handbook indexed successfully.")
268
  index, meta = load_faiss_index()
269
  if index is None or meta is None:
270
- st.error("Failed to load FAISS index.")
271
  st.stop()
272
  return index, meta
273
 
274
-
275
  # =============================================================
276
  # πŸ’¬ Chat Interface
277
  # =============================================================
@@ -281,7 +267,7 @@ st.subheader("πŸ’¬ Ask about the Handbook")
281
  if "history" not in st.session_state:
282
  st.session_state.history = []
283
 
284
- user_query = st.text_input("Your question about the handbook:")
285
  index, meta = ensure_index()
286
 
287
  if st.button("Ask") and user_query.strip():
@@ -289,14 +275,14 @@ if st.button("Ask") and user_query.strip():
289
  if not results:
290
  st.warning("No relevant section found in the handbook.")
291
  else:
292
- context_text = "\n\n".join(
293
- [f"(πŸ“„ Page {r['page']} β€” {r['filename']})\n{r['content']}" for r in results]
294
  )
295
- answer = generate_answer(context_text, user_query)
296
  st.session_state.history.append({"user": user_query, "assistant": answer})
297
 
298
  for chat in st.session_state.history:
299
  st_message(chat["user"], is_user=True)
300
  st_message(chat["assistant"])
301
 
302
- st.caption("⚑ Powered by FAISS + Hugging Face Inference API + Mistral 7B")
 
1
+ # =============================================================
2
+ # πŸ“˜ USTP Student Handbook Assistant (2023 Edition)
3
+ # =============================================================
4
+ # Enhanced: dynamic model selection + real (printed) page numbering
5
+
6
  import os
 
7
  import glob
8
  import json
9
+ import time
10
  from typing import List, Dict, Any
 
11
  import numpy as np
12
  import streamlit as st
13
  import PyPDF2
 
16
  from huggingface_hub import InferenceClient, login
17
  from streamlit_chat import message as st_message
18
 
19
+ # Optional: FAISS for fast vector search
20
  try:
21
  import faiss
22
  except ImportError:
23
  faiss = None
24
 
25
  # =============================================================
26
+ # 🌐 Startup Fix for PermissionError
27
+ # =============================================================
28
+ os.environ["STREAMLIT_HOME"] = "/tmp/.streamlit"
29
+ os.makedirs("/tmp/.streamlit", exist_ok=True)
30
+
31
+ # =============================================================
32
+ # βš™οΈ Streamlit Page Setup
33
  # =============================================================
34
  st.set_page_config(page_title="πŸ“˜ Handbook Assistant", page_icon="πŸ“˜", layout="wide")
35
  st.title("πŸ“˜ USTP Student Handbook Assistant (2023 Edition)")
36
+ st.caption("Answers sourced only from the official *USTP Student Handbook 2023 Edition.pdf*.")
37
 
38
  load_dotenv()
39
  HF_TOKEN = os.getenv("HF_TOKEN")
40
 
41
  if not HF_TOKEN:
42
+ st.warning("⚠️ No Hugging Face API token found in .env file. Online models will be unavailable.")
43
  else:
44
  try:
45
  login(HF_TOKEN)
 
49
  hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
50
 
51
# =============================================================
# ⚙️ Sidebar Configuration
# =============================================================
with st.sidebar:
    st.header("⚙️ Settings")

    # Human-readable labels mapped to Hugging Face model ids.
    model_options = {
        "Qwen 2.5 14B Instruct": "Qwen/Qwen2.5-14B-Instruct",
        "Mistral 7B Instruct": "mistralai/Mistral-7B-Instruct-v0.3",
        "Llama 3 8B Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
        "Mixtral 8x7B Instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "Falcon 7B Instruct": "tiiuae/falcon-7b-instruct",
    }
    model_choice = st.selectbox("Select reasoning model", list(model_options.keys()), index=0)
    DEFAULT_MODEL = model_options[model_choice]

    st.markdown("---")
    # Retrieval / chunking knobs used by the indexing and search functions.
    similarity_threshold = st.slider("Similarity threshold", 0.3, 1.0, 0.6, 0.01)
    top_k = st.slider("Top K retrieved chunks", 1, 10, 4)
    chunk_size_chars = st.number_input("Chunk size (chars)", 400, 2500, 1200, 100)
    chunk_overlap = st.number_input("Chunk overlap (chars)", 20, 600, 150, 10)
    # Offset between physical PDF pages and the handbook's printed numbering.
    front_matter_pages = st.number_input(
        "Pages before main content (e.g. table of contents, cover)", min_value=0, max_value=50, value=12
    )
    regenerate_index = st.button("🔁 Rebuild handbook index")

# =============================================================
# 📂 File Config
# =============================================================
INDEX_FILE = "handbook_faiss.index"
META_FILE = "handbook_metadata.json"
EMB_DIM_FILE = "handbook_emb_dim.json"
EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"
 
 
 
 
 
 
 
84
 
85
  # =============================================================
86
  # 🧩 Utility Functions
87
  # =============================================================
 
88
def find_handbook() -> List[str]:
    """Locate the handbook PDF, preferring the official 2023 edition.

    Searches the directory containing this script rather than the process
    working directory (under Streamlit the CWD is not guaranteed to be the
    app folder), then falls back to any other PDF found there.

    Returns:
        A single-element list with the chosen PDF path, or [] if none found.
    """
    preferred = "USTP Student Handbook 2023 Edition.pdf"
    # Anchor the search to this file's directory so behavior does not depend
    # on where the Streamlit process was launched from; fall back to CWD.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    pdfs = glob.glob(os.path.join(base_dir, "*.pdf")) or glob.glob("*.pdf")
    for path in pdfs:
        if preferred.lower() in os.path.basename(path).lower():
            st.success(f"📘 Found handbook: {os.path.basename(path)}")
            return [path]
    if pdfs:
        st.warning(f"⚠️ Preferred handbook not found. Using {os.path.basename(pdfs[0])}.")
        return [pdfs[0]]
    st.error("❌ No PDF found in current folder.")
    return []
100
 
101
 
102
def load_pdf_texts(pdf_paths: List[str], front_matter=None) -> List[Dict[str, Any]]:
    """Extract page text while adjusting page numbering to printed handbook numbers.

    Args:
        pdf_paths: Paths of PDF files to read.
        front_matter: Number of leading pages (cover, table of contents)
            before printed page 1. Defaults to the sidebar-configured
            ``front_matter_pages`` value when omitted, so existing callers
            are unaffected.

    Returns:
        One dict per non-empty page with keys ``filename``, ``page``
        (printed page number, clamped to >= 1) and ``text``.
    """
    if front_matter is None:
        # Fall back to the module-level sidebar setting for compatibility.
        front_matter = front_matter_pages
    pages = []
    for path in pdf_paths:
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for i, page in enumerate(reader.pages):
                text = page.extract_text() or ""
                if text.strip():
                    # Map the physical page index to the printed page number
                    # by subtracting the front-matter offset; pages inside
                    # the front matter clamp to printed page 1.
                    printed_page = max(1, (i + 1) - front_matter)
                    pages.append({
                        "filename": os.path.basename(path),
                        "page": printed_page,
                        "text": text.strip(),
                    })
    return pages
122
 
123
 
124
  def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int) -> List[Dict[str, Any]]:
 
125
  chunks = []
126
  for p in pages:
127
  text = p["text"]
 
138
  return chunks
139
 
140
 
 
141
def embed_texts(texts: List[str]) -> np.ndarray:
    """Generate embeddings using Hugging Face feature extraction.

    Tries the InferenceClient first, then falls back to the raw Inference
    REST API. On any unrecoverable failure a zero matrix of shape
    ``(len(texts), 768)`` is returned so callers can detect failure
    without the app crashing.
    """
    if not HF_TOKEN or not hf_client:
        st.error("❌ Missing Hugging Face token or client.")
        return np.zeros((len(texts), 768))
    try:
        embeddings = hf_client.feature_extraction(texts, model=EMBED_MODEL)
        # Token-level outputs arrive as one vector per token; mean-pool
        # them into a single sentence vector.
        if isinstance(embeddings[0][0], list):
            embeddings = [np.mean(np.array(e), axis=0) for e in embeddings]
        return np.array(embeddings)
    except Exception as e1:
        st.warning(f"⚠️ feature_extraction failed, using REST API fallback: {e1}")
        try:
            # Local import: `requests` is not guaranteed to be imported at
            # module level in this file.
            import requests

            headers = {"Authorization": f"Bearer {HF_TOKEN}"}
            resp = requests.post(
                f"https://api-inference.huggingface.co/models/{EMBED_MODEL}",
                headers=headers,
                json={"inputs": texts},
            )
            # Surface HTTP errors (401, 503 model-loading, ...) explicitly
            # instead of failing later on an unexpected JSON payload.
            resp.raise_for_status()
            data = resp.json()
            if isinstance(data[0][0], list):
                data = [np.mean(np.array(e), axis=0) for e in data]
            return np.array(data)
        except Exception as e2:
            # Keep the zero-matrix failure contract instead of crashing.
            st.error(f"Embedding error: {e2}")
            return np.zeros((len(texts), 768))
 
 
 
 
 
 
 
 
 
 
163
 
164
 
165
def build_faiss_index(chunks: List[Dict[str, Any]]):
    """Build and persist a FAISS L2 index over the chunk embeddings.

    Writes the index, the chunk metadata, and the embedding dimension to
    the module-level INDEX_FILE / META_FILE / EMB_DIM_FILE paths. Aborts
    with a Streamlit error (instead of raising) when faiss is unavailable
    or embedding generation failed.
    """
    if faiss is None:
        # faiss is an optional dependency: the module-top import is wrapped
        # in try/except ImportError, so guard before using it.
        st.error("❌ faiss is not installed; cannot build index.")
        return
    texts = [c["content"] for c in chunks]
    embeddings = embed_texts(texts)
    if embeddings.size == 0:
        st.error("❌ Embedding generation failed.")
        return
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings.astype("float32"))
    faiss.write_index(index, INDEX_FILE)
    with open(META_FILE, "w") as f:
        json.dump(chunks, f)
    with open(EMB_DIM_FILE, "w") as f:
        json.dump({"dim": dim}, f)
    st.success(f"✅ Indexed {len(chunks)} chunks.")
181
 
182
 
183
def load_faiss_index():
    """Load the persisted FAISS index and chunk metadata.

    Returns:
        ``(index, meta)`` on success, or ``(None, None)`` when either
        persisted file is missing.
    """
    have_index = os.path.exists(INDEX_FILE)
    have_meta = os.path.exists(META_FILE)
    if not (have_index and have_meta):
        return None, None
    index = faiss.read_index(INDEX_FILE)
    with open(META_FILE) as f:
        meta = json.load(f)
    return index, meta
190
 
191
 
192
def search_index(query: str, index, meta, top_k: int, threshold: float):
    """Return up to ``top_k`` metadata chunks nearest to the query embedding.

    Args:
        query: User question to embed and search for.
        index: FAISS index built by ``build_faiss_index``.
        meta: Chunk metadata list aligned with the index rows.
        top_k: Maximum number of results.
        threshold: NOTE(review): currently unused (kept for interface
            compatibility); results are ranked purely by L2 distance.

    Returns:
        Copies of the matching metadata dicts with a ``distance`` key added.
    """
    query_emb = embed_texts([query])
    distances, indices = index.search(query_emb.astype("float32"), top_k)
    results = []
    for i, dist in zip(indices[0], distances[0]):
        # FAISS pads missing results with -1, which would otherwise pass
        # `i < len(meta)` and silently return the LAST chunk — guard both ends.
        if 0 <= i < len(meta):
            # Copy so the cached metadata list is never mutated in place.
            r = dict(meta[i])
            r["distance"] = float(dist)
            results.append(r)
    return results
202
 
203
 
204
def generate_answer(context: str, query: str) -> str:
    """Generate a handbook-grounded answer using the selected model.

    Tries the text-generation endpoint first, then falls back to the
    chat-completions endpoint (some hosted models only serve one task).
    Returns an error string rather than raising on failure.
    """
    prompt = f"""
You are a precise academic assistant specialized in university policy.
Use only the *USTP Student Handbook 2023 Edition* below.
If the answer is not in the text, reply:
"The handbook does not specify that."

---
📘 Context:
{context}
---
🧭 Question:
{query}
---
🎯 Instructions:
- Be factual and concise.
- Cite the correct printed page number.
- Never make assumptions.
"""
    if hf_client is None:
        # No HF token at startup means no client; fail gracefully instead
        # of raising AttributeError on the calls below.
        return "⚠️ Error generating answer: Hugging Face client not initialized."
    try:
        response = hf_client.text_generation(
            model=DEFAULT_MODEL,
            prompt=prompt,
            max_new_tokens=400,
            temperature=0.25,
        )
        # Some client versions return rich objects; always hand back a str.
        return response if isinstance(response, str) else str(response)
    except Exception:
        # The model may only expose the conversational task — retry via chat.
        try:
            chat_response = hf_client.chat.completions.create(
                model=DEFAULT_MODEL,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=400,
            )
            msg = chat_response.choices[0].message
            # huggingface_hub returns a dataclass-like message object in
            # newer versions; support both mapping and attribute access.
            return msg["content"] if isinstance(msg, dict) else msg.content
        except Exception as e2:
            return f"⚠️ Error generating answer: {e2}"
243
 
244
 
 
 
 
245
def ensure_index():
    """Ensure a FAISS index exists (building it if needed) and return it.

    Rebuilds when the sidebar rebuild button was pressed or no index file
    exists on disk; stops the Streamlit script on any unrecoverable
    failure (no PDF, no extractable text, unloadable index).
    """
    if regenerate_index or not os.path.exists(INDEX_FILE):
        pdfs = find_handbook()
        if not pdfs:
            st.stop()
        st.info("📄 Extracting handbook text...")
        pages = load_pdf_texts(pdfs)
        if not pages:
            # Scanned/image-only PDFs yield no extractable text; without
            # this guard an empty index build fails much less clearly.
            st.error("No text extracted from handbook.")
            st.stop()
        chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
        build_faiss_index(chunks)
    index, meta = load_faiss_index()
    if index is None or meta is None:
        st.error("❌ Could not load FAISS index.")
        st.stop()
    return index, meta
260
 
 
261
  # =============================================================
262
  # πŸ’¬ Chat Interface
263
  # =============================================================
 
267
  if "history" not in st.session_state:
268
  st.session_state.history = []
269
 
270
+ user_query = st.text_input("Enter your question:")
271
  index, meta = ensure_index()
272
 
273
  if st.button("Ask") and user_query.strip():
 
275
  if not results:
276
  st.warning("No relevant section found in the handbook.")
277
  else:
278
+ context = "\n\n".join(
279
+ [f"(πŸ“„ Page {r['page']})\n{r['content']}" for r in results]
280
  )
281
+ answer = generate_answer(context, user_query)
282
  st.session_state.history.append({"user": user_query, "assistant": answer})
283
 
284
  for chat in st.session_state.history:
285
  st_message(chat["user"], is_user=True)
286
  st_message(chat["assistant"])
287
 
288
+ st.caption("⚑ Powered by FAISS + Open Source Models + Accurate Page Referencing")