File size: 3,425 Bytes
4687fa9
 
 
 
 
24deec1
4687fa9
 
 
 
 
 
 
 
 
 
5630f6b
e152803
 
 
 
5630f6b
4687fa9
e152803
4687fa9
e152803
 
 
4687fa9
 
 
 
 
e152803
 
24deec1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# ----------------------------
# Hugging Face cache bootstrap
# ----------------------------
import os
import tempfile

# One directory shared by every Hugging Face cache; it is created before any
# environment variable is pointed at it.
CACHE_DIR = "/home/user/huggingface"
os.makedirs(CACHE_DIR, exist_ok=True)

# Point all four cache-related environment variables at that same directory.
os.environ.update(
    dict.fromkeys(
        ("HF_HOME", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "HF_MODULES_CACHE"),
        CACHE_DIR,
    )
)

# ----------------------------
# Imports AFTER cache bootstrap
# ----------------------------
import streamlit as st
from ingestion import extract_text_from_pdf, chunk_text
from embeddings import generate_embeddings
from vectorstore import build_faiss_index
from qa import retrieve_chunks, generate_answer

# ----------------------------
# App Config
# ----------------------------
# Browser-tab title and wide layout; kept ahead of every other Streamlit call
# in this script.
st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")

# The heading emoji is spelled as real UTF-8 (U+1F4C4, "page facing up") so it
# renders as an icon rather than as a run of mis-decoded glyphs.
st.title("📄 Enterprise Knowledge Assistant")

st.write("Upload a PDF **or try the sample file** to explore this assistant.")

# ----------------------------
# Sidebar (Settings + Credits)
# ----------------------------
with st.sidebar:
    st.image("app/logo.png", width=150)

    # User-tunable settings. Slider arguments are (label, min, max, default).
    # `chunk_size` is later handed to chunk_text and `top_k` to
    # retrieve_chunks in the processing section of this script.
    st.header("⚙️ Settings")
    chunk_size = st.slider("Chunk Size", 200, 1000, 500, step=100)
    top_k = st.slider("Top K Results", 1, 5, 3)

    # Credits, separated from the settings by a horizontal rule.
    st.markdown("---")
    st.caption("👨‍💻 Built by Shubham Sharma")
    st.markdown("[📂 GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")

# ----------------------------
# File Upload Section
# ----------------------------
def _switch_to_uploaded_pdf():
    """Leave sample mode whenever the contents of the uploader change."""
    st.session_state["use_sample"] = False


# Sample mode has to survive reruns. st.button is only True on the single run
# triggered by the click, so clearing the flag on every other run would drop
# the sample document as soon as the user submits a question.
if "use_sample" not in st.session_state:
    st.session_state["use_sample"] = False

uploaded_file = st.file_uploader(
    "📂 Upload your PDF",
    type="pdf",
    # Adding, replacing or removing an upload hands control back to the
    # uploader; the callback runs before the script is re-executed.
    on_change=_switch_to_uploaded_pdf,
)

if st.button("📘 Try with Sample PDF"):
    st.session_state["use_sample"] = True

if st.session_state["use_sample"]:
    # In sample mode the processing section only tests this value for
    # truthiness and reads the PDF by path, so store the path itself instead
    # of an open file handle that nothing would ever close.
    uploaded_file = "app/sample.pdf"

# ----------------------------
# File Handling + Processing
# ----------------------------
if uploaded_file:
    # Phase 2: Extract Text
    if st.session_state.get("use_sample", False):
        # Sample mode reads the bundled PDF by path; `uploaded_file` itself is
        # not read in this branch.
        st.info("Using **default sample.pdf** ✅")
        text = extract_text_from_pdf(os.path.join("app", "sample.pdf"))
    else:
        # Write the upload to a uniquely named temporary file so concurrent
        # sessions cannot overwrite each other's document. The file is closed
        # before extraction and removed afterwards so uploads do not pile up
        # on disk.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(uploaded_file.getbuffer())
        try:
            text = extract_text_from_pdf(tmp_pdf.name)
        finally:
            os.remove(tmp_pdf.name)

    st.subheader("📑 Extracted Text (Preview)")
    st.write(text[:1000])

    # Phase 3: Chunking
    chunks = chunk_text(text, chunk_size=chunk_size)
    st.write(f"📦 Total Chunks Created: {len(chunks)}")

    if len(chunks) == 0:
        # Nothing to embed or index (for example a PDF with no extractable
        # text), so stop here instead of passing empty input downstream.
        st.warning("No text could be extracted from this PDF, so there is nothing to search.")
    else:
        st.subheader("🧩 Chunked Text (First 3 Chunks)")
        for i, chunk in enumerate(chunks[:3], start=1):
            st.write(f"**Chunk {i}:** {chunk}")

        # Phase 4: Embeddings
        embeddings = generate_embeddings(chunks)
        st.success(f"✅ Generated {len(embeddings)} embeddings.")

        # Phase 5: Vector Store (FAISS)
        index = build_faiss_index(embeddings)

        # Phase 6 & 7: Q&A
        user_query = st.text_input("🔍 Ask a question about the document:")

        if user_query:
            # Fetch the top_k chunks for the question, then generate an answer
            # from the question and those chunks.
            retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
            answer = generate_answer(user_query, retrieved)

            # Final Answer
            st.subheader("🤖 Assistant’s Answer")
            st.write(answer)

            # Supporting Chunks
            st.subheader("📄 Supporting Chunks")
            for i, r in enumerate(retrieved, start=1):
                st.write(f"**Chunk {i}:** {r}")
else:
    st.info("⬅️ Upload a PDF or click 'Try with Sample PDF' to begin.")