Spaces:

Starberry15
/

Handbook-Chatbot

Sleeping

App Files Files Community

Starberry15 committed on Oct 21, 2025

Commit

13ec6bf

verified ·

1 Parent(s): 22206db

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +260 -33

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,267 @@
-import altair as alt
 import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
 """
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import os
+import time
+import glob
+import json
+from typing import List, Dict, Any
 import numpy as np
 import streamlit as st
+import PyPDF2
+from dotenv import load_dotenv
+from huggingface_hub import InferenceClient, login
+from streamlit_chat import message as st_message
+# =============================================================
+# 🧩 Try optional imports
+# =============================================================
+try:
+    import faiss
+except ImportError:
+    faiss = None
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
+# =============================================================
+# 🌐 Environment & Page Setup
+# =============================================================
+# Configure the browser tab and layout, then render the page header.
+st.set_page_config(page_title="📘 Handbook Assistant", page_icon="📘", layout="wide")
+st.title("📘 USTP Student Handbook Assistant (2023 Edition)")
+st.caption("References only *USTP Student Handbook 2023 Edition.pdf* in this folder.")
+# Load variables from a local .env file (if present) before reading the token.
+load_dotenv()
+HF_TOKEN = os.getenv("HF_TOKEN")
+# Login is best-effort: a failure only shows a warning and the script keeps running.
+if HF_TOKEN:
+    try:
+        login(HF_TOKEN)
+    except Exception:
+        st.warning("⚠️ Could not login to Hugging Face.")
+# Without a token no client is created; generate_answer() checks for that case.
+hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None
+# =============================================================
+# ⚙️ Sidebar Configuration
+# =============================================================
+# Tunables collected in the sidebar; they are read by the index/search/answer
+# code further down in this script.
+with st.sidebar:
+    st.header("⚙️ Settings")
+    # Model id handed to generate_answer() for the hosted generation call.
+    model_choice = st.selectbox(
+        "Select LLM model",
+        [
+            "Qwen/Qwen2.5-14B-Instruct",
+            "mistralai/Mistral-7B-Instruct-v0.3",
+            "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tiiuae/falcon-7b-instruct",
+        ],
+        index=0,  # Default: Qwen 14B
+    )
+    # Passed to search_index() as its `threshold` argument.
+    similarity_threshold = st.slider("Similarity Threshold", 0.3, 1.0, 0.6, 0.01)
+    # Number of nearest chunks requested from the index per question.
+    top_k = st.slider("Top K Results", 1, 10, 4)
+    # Window length and overlap (in characters) given to chunk_text().
+    # NOTE(review): the overlap maximum (600) exceeds the chunk-size minimum (400),
+    # so the UI permits overlap >= size — confirm chunk_text() copes with that.
+    chunk_size_chars = st.number_input("Chunk Size (chars)", 400, 2500, 1200, 100)
+    chunk_overlap = st.number_input("Chunk Overlap (chars)", 20, 600, 150, 10)
+    # Read by ensure_index() to force a rebuild of the stored index.
+    regenerate_index = st.button("🔁 Rebuild Handbook Index")
+# =============================================================
+# 🧠 Utility Functions
+# =============================================================
+def find_handbook() -> List[str]:
+    """Locate the handbook PDF stored next to this script.
+
+    Returns:
+        A one-element list with the path of the preferred handbook when it
+        exists; otherwise the alphabetically first ``*.pdf`` found in the
+        script's directory (a warning is shown); otherwise an empty list
+        (an error is shown).
+    """
+    preferred = "USTP Student Handbook 2023 Edition.pdf"
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    pdf_path = os.path.join(current_dir, preferred)
+    if os.path.exists(pdf_path):
+        return [pdf_path]
+    # glob order depends on the file system; sort so the fallback choice is
+    # the same on every run and every host.
+    pdfs = sorted(glob.glob(os.path.join(current_dir, "*.pdf")))
+    if pdfs:
+        st.warning(f"⚠️ Using {os.path.basename(pdfs[0])} (preferred handbook not found)")
+        return [pdfs[0]]
+    st.error("❌ No PDF found in this folder.")
+    return []
+def load_pdf_texts(pdf_paths: List[str]) -> List[Dict[str, Any]]:
+    """Extract the text of every non-blank page from the given PDF files.
+
+    Each returned record holds the file's base name (``filename``), the
+    1-based page number (``page``) and the extracted text (``text``).
+    Pages whose extracted text is empty or whitespace-only are skipped.
+    """
+    records: List[Dict[str, Any]] = []
+    for pdf_path in pdf_paths:
+        with open(pdf_path, "rb") as handle:
+            reader = PyPDF2.PdfReader(handle)
+            for number, page in enumerate(reader.pages, start=1):
+                extracted = page.extract_text() or ""
+                if not extracted.strip():
+                    continue
+                records.append(
+                    {"filename": os.path.basename(pdf_path), "page": number, "text": extracted}
+                )
+    return records
+def chunk_text(pages: List[Dict[str, Any]], size: int, overlap: int):
+    """Split page texts into overlapping character windows.
+
+    Args:
+        pages: Records with ``filename``, ``page`` and ``text`` keys.
+        size: Window length in characters; must be positive.
+        overlap: Number of characters shared by consecutive windows; must be
+            smaller than ``size``.
+
+    Returns:
+        A list of dicts with ``filename``, ``page`` and ``content`` (the
+        stripped window text). Windows that are empty after stripping are
+        dropped, since they carry nothing worth indexing.
+
+    Raises:
+        ValueError: If ``size`` is not positive or ``overlap >= size``.
+    """
+    if size <= 0:
+        raise ValueError(f"size must be positive, got {size}")
+    # With overlap >= size the stride is zero or negative, `start` never
+    # moves past the end of the text and the loop below would never finish.
+    if overlap >= size:
+        raise ValueError(f"overlap ({overlap}) must be smaller than size ({size})")
+    step = size - overlap
+    chunks = []
+    for p in pages:
+        text = p["text"]
+        for start in range(0, len(text), step):
+            content = text[start:start + size].strip()
+            if not content:
+                continue
+            chunks.append({
+                "filename": p["filename"],
+                "page": p["page"],
+                "content": content,
+            })
+    return chunks
+# =============================================================
+# 🧠 Embeddings (Stable + Non-blocking)
+# =============================================================
+@st.cache_resource
+def load_local_embedder():
+    """Return a local SentenceTransformer, trying a second model if the first fails.
+
+    Raises ImportError when sentence-transformers is not installed. When
+    neither model can be constructed, the last error is shown and None is
+    returned.
+    """
+    if SentenceTransformer is None:
+        raise ImportError("sentence-transformers not installed.")
+    last_error = None
+    # Candidates are tried in order; the first one that loads wins.
+    for model_name in ("all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2"):
+        try:
+            return SentenceTransformer(model_name)
+        except Exception as exc:
+            last_error = exc
+    st.error(f"Embedding model load failed: {last_error}")
+    return None
+def embed_texts(texts: List[str]) -> np.ndarray:
+    """Embed *texts* with the local sentence-transformers model.
+
+    Returns the array produced by ``model.encode``. If the model cannot be
+    loaded or encoding raises, a warning/error is shown and an all-zero
+    ``(len(texts), 384)`` float32 array is returned instead, so callers
+    always receive an array.
+    """
+    try:
+        model = load_local_embedder()
+        # Compare against None explicitly: the truthiness of a model object
+        # may be driven by ``__len__`` rather than by "was it loaded".
+        if model is not None:
+            return model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
+    except Exception as e:
+        st.warning(f"⚠️ Local embedding failed: {e}")
+    # NOTE(review): zero vectors carry no similarity information, and callers
+    # cannot tell this placeholder apart from real embeddings.
+    st.error("❌ Could not generate embeddings; returning empty array.")
+    return np.zeros((len(texts), 384), dtype="float32")
+# =============================================================
+# 🗂️ FAISS Index
+# =============================================================
+# File names used by build_faiss_index() / load_faiss_index() to persist the
+# vector index and the chunk metadata. They are relative paths.
+INDEX_FILE = "handbook_faiss.index"
+META_FILE = "handbook_metadata.json"
+def build_faiss_index(chunks):
+    """Embed *chunks* and persist a flat L2 FAISS index plus its metadata.
+
+    Args:
+        chunks: Dicts with at least a ``content`` key; the whole list is
+            written to ``META_FILE`` as JSON, in the same order as the
+            vectors added to the index.
+
+    Returns:
+        None. On failure an error is shown and nothing is written.
+    """
+    # faiss is an optional import; without it there is nothing to build with.
+    if faiss is None:
+        st.error("❌ faiss is not installed; cannot build the handbook index.")
+        return
+    texts = [c["content"] for c in chunks]
+    embeddings = embed_texts(texts)
+    if embeddings.size == 0:
+        st.error("❌ Embedding generation failed.")
+        return
+    dim = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dim)
+    index.add(embeddings.astype("float32"))
+    faiss.write_index(index, INDEX_FILE)
+    # Fix the encoding so the file does not depend on the platform default.
+    with open(META_FILE, "w", encoding="utf-8") as f:
+        json.dump(chunks, f)
+def load_faiss_index():
+    """Load the persisted FAISS index and its chunk metadata.
+
+    Returns:
+        ``(index, meta)`` when both files exist and faiss is available,
+        otherwise ``(None, None)``.
+    """
+    # faiss is an optional import; report "no index" instead of raising
+    # AttributeError on ``None.read_index``.
+    if faiss is None:
+        return None, None
+    if not (os.path.exists(INDEX_FILE) and os.path.exists(META_FILE)):
+        return None, None
+    index = faiss.read_index(INDEX_FILE)
+    with open(META_FILE, encoding="utf-8") as f:
+        meta = json.load(f)
+    return index, meta
+# =============================================================
+# 🔍 Search
+# =============================================================
+def search_index(query: str, index, meta, top_k: int, threshold: float):
+    """Return the metadata of the chunks nearest to *query*.
+
+    Args:
+        query: Question text to embed and look up.
+        index: Object exposing ``search(vectors, k)`` (the loaded FAISS
+            index), or None when no index is available.
+        meta: Chunk metadata, positionally aligned with the index vectors.
+        top_k: Number of neighbours to request.
+        threshold: Accepted for interface compatibility; results are
+            currently NOT filtered by it.
+
+    Returns:
+        A list of new dicts (copies of the metadata entries) with an added
+        ``distance`` key, in the order returned by the index. Empty when no
+        index or metadata is available.
+    """
+    if index is None or not meta:
+        return []
+    query_emb = embed_texts([query])
+    distances, indices = index.search(query_emb.astype("float32"), top_k)
+    results = []
+    for i, dist in zip(indices[0], distances[0]):
+        # FAISS pads missing neighbours with -1; without the lower bound that
+        # value would silently select the last metadata entry.
+        if 0 <= i < len(meta):
+            # Copy so the shared metadata list is not mutated between queries.
+            results.append({**meta[i], "distance": float(dist)})
+    return results
+# =============================================================
+# 💬 Answer Generation
+# =============================================================
+def generate_answer(context: str, query: str, model_name: str):
+    """Ask the hosted model *model_name* to answer *query* from *context*.
+
+    Returns the generated text, a fixed message when no Hugging Face client
+    exists, or an error message string when the generation call raises.
+    """
+    if not hf_client:
+        return "❌ Hugging Face client not initialized."
+    prompt = f"""
+You are a precise academic assistant specialized in university policies.
+Use only the provided *USTP Student Handbook 2023 Edition* content as reference.
+If the answer is not explicitly found, respond with:
+"The handbook does not specify that."
+---
+📘 **Context (from the handbook)**:
+{context}
+---
+🧭 **Question**:
+{query}
+---
+🎯 **Instructions**:
+- Answer concisely and factually.
+- Include page numbers and filename references where relevant.
 """
+    # Sampling settings for the generation request.
+    generation_options = {
+        "max_new_tokens": 400,
+        "temperature": 0.25,
+        "repetition_penalty": 1.1,
+    }
+    try:
+        return hf_client.text_generation(model=model_name, prompt=prompt, **generation_options)
+    except Exception as e:
+        return f"⚠️ Error generating answer: {e}"
+# =============================================================
+# ✅ Ensure Index Loads Immediately
+# =============================================================
+def ensure_index():
+    """Make sure a persisted index exists, rebuilding it when needed.
+
+    A rebuild happens when the sidebar button requested it or when either
+    the index file or the metadata file is missing. The script is stopped
+    via ``st.stop()`` when no PDF is found or no text can be extracted.
+
+    Returns:
+        Whatever ``load_faiss_index()`` returns: ``(index, meta)`` or
+        ``(None, None)``.
+    """
+    # Both files are needed to answer queries; checking only the index file
+    # would leave the app stuck if the metadata file went missing.
+    files_present = os.path.exists(INDEX_FILE) and os.path.exists(META_FILE)
+    if regenerate_index or not files_present:
+        pdfs = find_handbook()
+        if not pdfs:
+            st.stop()
+        pages = load_pdf_texts(pdfs)
+        if not pages:
+            st.error("No text extracted.")
+            st.stop()
+        chunks = chunk_text(pages, chunk_size_chars, chunk_overlap)
+        build_faiss_index(chunks)
+        # build_faiss_index() returns None on failure too, so only report
+        # success when the files are actually on disk.
+        if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
+            st.success("✅ Index rebuilt.")
+    return load_faiss_index()
+# =============================================================
+# 🧠 Main Chat Interface
+# =============================================================
+st.divider()
+st.subheader("💬 Ask about the Handbook")
+index, meta = ensure_index()
+# Chat turns are kept in session state so they survive Streamlit reruns.
+if "history" not in st.session_state:
+    st.session_state.history = []
+user_query = st.text_input("Your question about the handbook:", key="user_input")
+if st.button("Ask", key="ask_btn") and user_query.strip():
+    # ensure_index() can return (None, None) when no index could be built or
+    # loaded; searching in that state would raise instead of informing the user.
+    if index is None or meta is None:
+        st.error("❌ Handbook index is not available. Try rebuilding it from the sidebar.")
+    else:
+        results = search_index(user_query, index, meta, top_k, similarity_threshold)
+        if not results:
+            st.warning("No relevant section found.")
+        else:
+            # Each retrieved chunk is prefixed with its page and file so the
+            # model can cite them in the answer.
+            context_text = "\n\n".join(
+                [f"(📄 Page {r['page']} — {r['filename']})\n{r['content']}" for r in results]
+            )
+            answer = generate_answer(context_text, user_query, model_choice)
+            st.session_state.history.append({"user": user_query, "assistant": answer})
+# Replay the whole conversation on every run; keys must be unique per widget.
+for i, chat in enumerate(st.session_state.history):
+    st_message(chat["user"], is_user=True, key=f"user_{i}")
+    st_message(chat["assistant"], key=f"assistant_{i}")
+st.caption("⚡ Powered by FAISS + Local Embeddings + Qwen 14B")