# NOTE(review): removed extraction artifacts (file-size banner, git blob hashes,
# and a run of viewer line numbers) that were not part of the Python source.
import os
import streamlit as st

# Surface full tracebacks in the browser while debugging upload problems.
# NOTE(review): comment claimed this prevents 403/CORS issues — showErrorDetails
# only controls error display; confirm whether a server.* option was intended.
st.set_option("client.showErrorDetails", True)

# ---------------------------
# Cache fix for Hugging Face Spaces: the default HF cache location is not
# writable on hosted containers, so point every HF cache variable at /tmp
# (the only guaranteed-writable directory) BEFORE importing any HF-backed
# modules below.
# ---------------------------
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
for _cache_var in ("HF_HOME", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "HF_MODULES_CACHE"):
    os.environ[_cache_var] = CACHE_DIR
# ---------------------------
# Imports AFTER environment setup
# ---------------------------
from ingestion import extract_text_from_pdf, chunk_text
from embeddings import generate_embeddings
from vectorstore import build_faiss_index
from qa import retrieve_chunks, generate_answer
# ---------------------------
# Paths
# ---------------------------
# Resolve assets relative to this file so the app works regardless of CWD.
BASE_DIR = os.path.dirname(__file__)  # e.g. /app/src
LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")

# ---------------------------
# App Config
# ---------------------------
# Fix: the title emoji was mojibake ("π") from a UTF-8/Latin-1 mix-up;
# restored to a proper emoji.
st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
st.title("📚 Enterprise Knowledge Assistant")
st.caption("Select a document from the library or upload your own, then ask questions.")
# ---------------------------
# Sidebar (Library + Settings + Credits)
# Fix: all sidebar emoji were mojibake (UTF-8 bytes decoded as Latin-1:
# "π", "βοΈ", "π¨βπ»"); restored to proper emoji.
# ---------------------------
with st.sidebar:
    if os.path.exists(LOGO_PATH):
        st.image(LOGO_PATH, width=150)

    # 1. Document Library — radio defaults to the placeholder so nothing
    # is processed until the user makes an explicit choice.
    st.header("📚 Document Library")
    doc_choice = st.radio(
        "Choose a document:",
        ["-- Select --", "Sample PDF", "Upload Custom PDF"],
        index=0
    )
    st.markdown("---")

    # 2. Settings — chunking granularity and retrieval depth.
    st.header("⚙️ Settings")
    chunk_size = st.slider("Chunk Size", 200, 1000, 500, step=100)
    top_k = st.slider("Top K Results", 1, 5, 3)
    st.markdown("---")

    # 3. Branding
    st.caption("👨‍💻 Built by Shubham Sharma")
    st.markdown("[🔗 GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
# ---------------------------
# Document Handling
# Fixes: (1) two st.info/st.success string literals were broken across lines
# mid-emoji (a syntax error) and contained mojibake — restored; (2) the
# extract -> chunk -> embed -> index pipeline was duplicated for both
# branches — factored into one helper; (3) the upload branch announced
# "processed!" before any processing ran — message now follows the work.
# ---------------------------
def _index_document(pdf_path, size):
    """Extract text from *pdf_path*, chunk it, embed the chunks, and build a
    FAISS index. Returns (text, chunks, index)."""
    doc_text = extract_text_from_pdf(pdf_path)
    doc_chunks = chunk_text(doc_text, chunk_size=size)
    doc_index = build_faiss_index(generate_embeddings(doc_chunks))
    return doc_text, doc_chunks, doc_index


text, chunks, index = None, None, None

if doc_choice == "-- Select --":
    st.info("⬅️ Please choose **Sample PDF** or **Upload Custom PDF** from the sidebar.")
elif doc_choice == "Sample PDF":
    st.success("📄 Sample PDF selected")
    text, chunks, index = _index_document(SAMPLE_PATH, chunk_size)
elif doc_choice == "Upload Custom PDF":
    uploaded_file = st.file_uploader("📄 Upload your PDF", type="pdf")
    if uploaded_file:
        # Always write to /tmp (the only guaranteed-writable folder on
        # hosted Spaces) before handing a real path to the PDF extractor.
        temp_path = os.path.join("/tmp", uploaded_file.name)
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        text, chunks, index = _index_document(temp_path, chunk_size)
        st.success("✅ Document uploaded and processed!")
# ---------------------------
# Document Preview
# Fix: mojibake emoji ("π", "π¦") restored.
# Shown only once a document has been chunked.
# ---------------------------
if chunks:
    st.subheader("📄 Document Preview")
    st.text_area("Extracted text (first 1000 chars)", text[:1000], height=150)
    st.caption(f"📦 {len(chunks)} chunks created")
# ---------------------------
# Query Section
# Fixes: the "Assistant's Answer" markdown string was broken across lines
# mid-emoji (a syntax error); mojibake emoji and a garbled curly apostrophe
# ("Assistantβs") restored.
# ---------------------------
if index and chunks:
    st.markdown("---")
    st.subheader("🤖 Ask a Question")
    user_query = st.text_input("🔍 Your question about the document:")
    if user_query:
        # Retrieve the top_k most relevant chunks, then generate an answer
        # grounded in them.
        retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
        answer = generate_answer(user_query, retrieved)
        st.markdown("### ✅ Assistant's Answer")
        st.write(answer)
        with st.expander("📑 Supporting Chunks"):
            for i, r in enumerate(retrieved, start=1):
                st.markdown(f"**Chunk {i}:** {r}")
# NOTE(review): removed a stray trailing "|" extraction artifact.