anneee266333 committed on
Commit 863c39b · verified · 1 Parent(s): 93f0d23

Create app.py

Files changed (1)
  1. app.py +295 -0
app.py ADDED
@@ -0,0 +1,295 @@
+ import os
+ import hashlib
+ from typing import BinaryIO, List
+
+ import streamlit as st
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from langchain_community.llms import HuggingFacePipeline
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough, RunnableLambda
+ from langchain_core.prompts import ChatPromptTemplate
+ from pypdf import PdfReader
+ from streamlit_chat import message
+
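+ # The imports above imply roughly these pip packages (names are an
+ # assumption; this commit pins nothing): streamlit, streamlit-chat, torch,
+ # transformers, langchain-community, langchain-core,
+ # langchain-text-splitters, faiss-cpu, sentence-transformers, pypdf.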
+
+ # --------------------------
+ # App Config
+ # --------------------------
+ st.set_page_config(
+     page_title="Simple QA - Built-in PDF",
+     page_icon="📘",
+     layout="wide"
+ )
+
+ DEFAULT_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
+ DEFAULT_MAX_NEW_TOKENS = 256
+ DEFAULT_TEMPERATURE = 0.2
+
+ SYSTEM_PROMPT = (
+     "You are a careful assistant for question answering. "
+     "Use ONLY the provided context to answer. "
+     "If the answer is not in the context, say you don't know."
+ )
+
+
+
+ # --------------------------
+ # Utilities
+ # --------------------------
+ def read_pdf_bytes_to_text(file_like: BinaryIO) -> str:
+     file_like.seek(0)
+     reader = PdfReader(file_like)
+     texts = []
+     for page in reader.pages:
+         texts.append(page.extract_text() or "")
+     return "\n".join(texts)
+
+
+ def compute_texts_hash(texts: List[str]) -> str:
+     data = "\n".join(texts)
+     return hashlib.sha256(data.encode("utf-8")).hexdigest()
+
+
+ def format_docs(docs):
+     return "\n\n".join(f"[{i+1}] {d.page_content}" for i, d in enumerate(docs))
+
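+ # format_docs numbers each chunk [1], [2], ... in the prompt context; the
+ # "Chunk [i]" expanders further down reuse that numbering, so an answer can
+ # be checked against the chunk it most likely came from.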
+
+
+ # --------------------------
+ # Caches
+ # --------------------------
+ @st.cache_resource(show_spinner=True)
+ def get_embeddings():
+     return HuggingFaceEmbeddings(
+         model_name="sentence-transformers/all-MiniLM-L6-v2",
+         model_kwargs={"device": "cpu"}
+     )
+
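+ # all-MiniLM-L6-v2 is a small 384-dimensional sentence-transformers model,
+ # which keeps CPU-only embedding of the whole handbook reasonably fast.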
+
+ @st.cache_resource(show_spinner=True)
+ def load_llm(
+     model_id=DEFAULT_MODEL_ID,
+     temperature=DEFAULT_TEMPERATURE,
+     max_new_tokens=DEFAULT_MAX_NEW_TOKENS
+ ):
+     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         torch_dtype=torch.float32,
+         low_cpu_mem_usage=True
+     )
+     gen = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         device=-1,  # CPU
+         do_sample=temperature > 0,  # temperature is ignored unless sampling is enabled
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         repetition_penalty=1.1,
+         pad_token_id=tokenizer.eos_token_id,
+         return_full_text=False,
+     )
+     return HuggingFacePipeline(pipeline=gen)
+
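+ # Note: HuggingFacePipeline is a plain-text LLM wrapper, so the chat prompt
+ # built below is flattened to a "System: ... Human: ..." string rather than
+ # going through the model's own chat template; tokenizer.apply_chat_template
+ # could improve answers for instruct models, but the plain string also works.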
97
+
98
+ def build_faiss_index(texts: List[str], chunk_size=800, chunk_overlap=120):
99
+ splitter = RecursiveCharacterTextSplitter(
100
+ chunk_size=chunk_size,
101
+ chunk_overlap=chunk_overlap
102
+ )
103
+ docs = splitter.create_documents(texts)
104
+ emb = get_embeddings()
105
+ vs = FAISS.from_documents(docs, embedding=emb)
106
+ return vs
107
+
108
+
109
+ def make_rag_chain(retriever, llm):
110
+ prompt = ChatPromptTemplate.from_messages([
111
+ ("system", SYSTEM_PROMPT),
112
+ ("human", "Context:\n{context}\n\nQuestion: {question}")
113
+ ])
114
+
115
+ chain = (
116
+ {
117
+ "context": retriever | RunnableLambda(format_docs),
118
+ "question": RunnablePassthrough()
119
+ }
120
+ | prompt
121
+ | llm
122
+ | StrOutputParser()
123
+ )
124
+ return chain
125
+
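+ # The dict branch fans the question out: the retriever fetches the top-k
+ # chunks (numbered by format_docs) while RunnablePassthrough forwards the
+ # question string itself. A hypothetical call, for illustration only:
+ #   make_rag_chain(retriever, llm).invoke("When was the handbook last revised?")
+ # returns a plain answer string.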
+
+
+ # --------------------------
+ # UI
+ # --------------------------
+ st.title("📘 Simple QA with Built-in Handbook PDF")
+
+ with st.sidebar:
+     st.header("⚙️ Model Settings")
+     model_id = st.text_input("Model ID", value=DEFAULT_MODEL_ID)
+     temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE, 0.05)
+     max_new_tokens = st.slider("Max new tokens", 32, 1024, DEFAULT_MAX_NEW_TOKENS, 32)
+     chunk_size = st.slider("Chunk size (chars)", 200, 1500, 800, 50)
+     chunk_overlap = st.slider("Chunk overlap (chars)", 0, 400, 120, 10)
+
+
+ # --------------------------
+ # Build Knowledge Base Automatically
+ # --------------------------
+ st.subheader("📚 Knowledge Base")
+ st.info("Using built-in handbook PDF as the knowledge base")
+
+ pdf_path = "USTP Student Handbook 2023 Edition.pdf"  # must be in the same folder
+
+ if not os.path.exists(pdf_path):
+     st.error(f"'{pdf_path}' not found. Please place it in the same folder as this app.")
+ else:
+     with open(pdf_path, "rb") as f:
+         texts = [read_pdf_bytes_to_text(f)]
+
+     # Key the index on the text and the chunking sliders so it is only
+     # rebuilt when one of them changes, not on every Streamlit rerun.
+     kb_hash = f"{compute_texts_hash(texts)}-{chunk_size}-{chunk_overlap}"
+
+     if st.session_state.get("kb_hash") != kb_hash:
+         with st.spinner("Building FAISS index..."):
+             vs = build_faiss_index(texts, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+         st.session_state["kb_hash"] = kb_hash
+         st.session_state["vector_store"] = vs
+         st.success("Knowledge base built successfully!")
+
+
+
+ # --------------------------
+ # Conversational Q&A Section
+ # --------------------------
+ st.subheader("💬 Chat with the Student Handbook")
+
+ # Initialize chat history
+ if "messages" not in st.session_state:
+     st.session_state["messages"] = [
+         {"role": "assistant", "content": "Hi 👋! Ask me anything about the student handbook."}
+     ]
+
+ # Display chat bubbles
+ for i, msg in enumerate(st.session_state["messages"]):
+     message(
+         msg["content"],
+         is_user=(msg["role"] == "user"),
+         key=f"{i}_{msg['role']}",
+         avatar_style="big-smile" if msg["role"] == "user" else "bottts"
+     )
+
+ # Input box for user
+ with st.form(key="chat_form", clear_on_submit=True):
+     question = st.text_input(
+         "💬 Type your question:",
+         placeholder="e.g. What are the rules for student discipline?",
+         key="chat_input"
+     )
+     submitted = st.form_submit_button("Send")
+
+ show_sources = st.checkbox("📖 Show retrieved chunks", value=True)
+
+ # Show the chunks retrieved for the most recent question. They are stashed
+ # in session state below, because anything rendered just before st.rerun()
+ # is discarded when the script restarts.
+ if show_sources and st.session_state.get("last_sources"):
+     st.markdown("### 📚 Retrieved Chunks")
+     for i, d in enumerate(st.session_state["last_sources"], start=1):
+         with st.expander(f"Chunk [{i}]"):
+             st.write(d.page_content[:800])
+
+ # Load LLM
+ if "llm" not in st.session_state:
+     with st.spinner("Loading model..."):
+         st.session_state["llm"] = load_llm(model_id, temperature, max_new_tokens)
+
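+ # The pipeline lives in session state, so the sidebar model settings are
+ # read only once per browser session; restart the app to apply a new model
+ # ID, temperature, or token limit.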
+ # Handle user question
+ if submitted and question:
+     st.session_state["messages"].append({"role": "user", "content": question})
+
+     if "vector_store" not in st.session_state:
+         st.warning("Knowledge base not built yet.")
+     else:
+         vs = st.session_state["vector_store"]
+         llm = st.session_state["llm"]
+
+         retriever = vs.as_retriever(search_type="similarity", search_kwargs={"k": 3})
+         chain = make_rag_chain(retriever, llm)
+
+         with st.spinner("Thinking..."):
+             answer = chain.invoke(question)
+
+         st.session_state["messages"].append({"role": "assistant", "content": answer})
+
+         # The LCEL chain only returns the answer string, so re-run the
+         # similarity search to capture the supporting chunks, and stash them
+         # for the display block above to render after the rerun.
+         st.session_state["last_sources"] = vs.similarity_search(question, k=3)
+
+         st.rerun()
+
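+ # Flow per question: append the user turn, run the RAG chain, stash the
+ # answer and supporting chunks in session state, then st.rerun() so the
+ # chat loop above redraws with the new turns included.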
+ # --------------------------
+ # Styling
+ # --------------------------
+ st.markdown("""
+ <style>
+ /* Overall background */
+ .stApp {
+     background-color: #f4f4ea;
+     font-family: 'Segoe UI', sans-serif;
+ }
+
+ /* Sidebar styling */
+ section[data-testid="stSidebar"] {
+     background-color: #e2e1f5;
+     color: black;
+ }
+
+ /* Buttons */
+ div.stButton > button {
+     background-color: #4a4a4a;
+     color: white;
+     border-radius: 8px;
+     font-size: 16px;
+ }
+ div.stButton > button:hover {
+     background-color: #2980b9;
+ }
+ h1, h2, h3 {
+     color: #2c3e50;
+ }
+
+ /* ---- Continuous Chat Background Fix ---- */
+
+ /* Remove vertical gaps between chat messages */
+ [data-testid="stVerticalBlock"] {
+     padding: 0 !important;
+     margin: 0 !important;
+ }
+
+ /* Prevent white padding above chat */
+ div[data-testid="stVerticalBlock"] > div:nth-child(1) {
+     margin-top: 0 !important;
+ }
+
+ /* Chat message bubble styles (note: these target Streamlit's native
+    st.chat_message classes; streamlit_chat bubbles ship their own styles) */
+ [class*="stChatMessage"] {
+     background-color: #f7f7f0 !important;
+     border-radius: 16px;
+     padding: 10px 16px !important;
+     margin-bottom: 4px !important;
+ }
+
+ /* User bubble color */
+ [class*="stChatMessageUser"] {
+     background-color: #e6f0ff !important;
+ }
+
+ /* Assistant bubble color */
+ [class*="stChatMessageAssistant"] {
+     background-color: #f0f0f0 !important;
+ }
+
+ /* Optional: smooth continuous background */
+ .stApp {
+     background: linear-gradient(to bottom, #f4f4ea 0%, #f4f4ea 100%);
+ }
+ </style>
+ """, unsafe_allow_html=True)