# NOTE(review): removed scraped GitHub blob residue (file-size line, git blame
# hashes, line-number gutter) that was not part of the source and broke parsing.
import os

# Redirect every Hugging Face cache to a writable location. On hosted
# platforms (e.g. Streamlit Cloud / containers) the default ~/.cache may be
# read-only, so we point HF at /tmp before any HF library is imported —
# these variables are read at import time by transformers/datasets.
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR  # legacy var, kept for older transformers versions
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["HF_MODULES_CACHE"] = CACHE_DIR
print(f"Using Hugging Face cache at {CACHE_DIR}")  # was a broken multi-line literal (mojibake)
# ----------------------------
# Imports AFTER cache bootstrap
# ----------------------------
import streamlit as st
from ingestion import extract_text_from_pdf, chunk_text
from embeddings import generate_embeddings
from vectorstore import build_faiss_index
from qa import retrieve_chunks, generate_answer
# ----------------------------
# App Config
# ----------------------------
# Page setup must be the first Streamlit call after import.
st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
# Garbled emoji (mojibake) removed from the visible title/copy.
st.title("Enterprise Knowledge Assistant")
st.write("Upload a PDF **or try the sample file** to explore this assistant.")
# ----------------------------
# Sidebar (Settings + Credits)
# ----------------------------
# Indentation under the `with` block was stripped in the paste; restored here.
with st.sidebar:
    st.image("src/logo.png", width=150)
    st.header("Settings")
    # Chunking / retrieval knobs consumed by the processing section below.
    chunk_size = st.slider("Chunk Size", 200, 1000, 500, step=100)
    top_k = st.slider("Top K Results", 1, 5, 3)
    st.markdown("---")
    st.caption("Built by Shubham Sharma")
    st.markdown("[GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
# ----------------------------
# File Upload Section
# ----------------------------
uploaded_file = st.file_uploader("Upload your PDF", type="pdf")
if st.button("Try with Sample PDF"):
    # The sample flow reads app/sample.pdf by path in the processing section;
    # this object is only needed as a truthy flag. Close the handle right away
    # so it does not leak — a closed file object is still truthy.
    uploaded_file = open("app/sample.pdf", "rb")
    uploaded_file.close()
    st.session_state["use_sample"] = True
else:
    # NOTE(review): this resets on every rerun, so the sample flow only lasts
    # for the single rerun triggered by the button click (original behavior).
    st.session_state["use_sample"] = False
# ----------------------------
# File Handling + Processing
# ----------------------------
if uploaded_file:
    if st.session_state.get("use_sample", False):
        # Sample flow: read the bundled PDF directly from disk.
        temp_path = os.path.join("app", "sample.pdf")
        st.info("Using **default sample.pdf**")
    else:
        # Persist the upload to a local file so downstream loaders can
        # read it by path.
        temp_path = "temp.pdf"
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

    # Phase 2: Extract Text
    text = extract_text_from_pdf(temp_path)
    st.subheader("Extracted Text (Preview)")
    st.write(text[:1000])

    # Phase 3: Chunking (chunk_size comes from the sidebar slider)
    chunks = chunk_text(text, chunk_size=chunk_size)
    st.write(f"Total Chunks Created: {len(chunks)}")
    st.subheader("Chunked Text (First 3 Chunks)")
    for i, chunk in enumerate(chunks[:3], start=1):
        st.write(f"**Chunk {i}:** {chunk}")

    # Phase 4: Embeddings
    embeddings = generate_embeddings(chunks)
    st.success(f"Generated {len(embeddings)} embeddings.")

    # Phase 5: Vector Store (FAISS)
    index = build_faiss_index(embeddings)

    # Phase 6 & 7: Q&A (top_k comes from the sidebar slider)
    user_query = st.text_input("Ask a question about the document:")
    if user_query:
        retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
        answer = generate_answer(user_query, retrieved)

        # Final Answer
        st.subheader("Assistant's Answer")
        st.write(answer)

        # Supporting Chunks
        st.subheader("Supporting Chunks")
        for i, r in enumerate(retrieved, start=1):
            st.write(f"**Chunk {i}:** {r}")
else:
    st.info("Upload a PDF or click 'Try with Sample PDF' to begin.")