Shubham170793 committed on
Commit
6944855
·
verified ·
1 Parent(s): 0b3513f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +78 -67
src/streamlit_app.py CHANGED
@@ -1,103 +1,114 @@
1
  import os
 
 
 
 
 
2
 
 
 
 
3
  CACHE_DIR = "/tmp/hf_cache"
4
  os.makedirs(CACHE_DIR, exist_ok=True)
5
-
6
  os.environ["HF_HOME"] = CACHE_DIR
7
  os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
8
  os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
9
  os.environ["HF_MODULES_CACHE"] = CACHE_DIR
10
 
11
- print(f"βœ… Using Hugging Face cache at {CACHE_DIR}")
 
 
 
 
 
12
 
13
-
14
- # ----------------------------
15
- # Imports AFTER cache bootstrap
16
- # ----------------------------
17
- import streamlit as st
18
- from ingestion import extract_text_from_pdf, chunk_text
19
- from embeddings import generate_embeddings
20
- from vectorstore import build_faiss_index
21
- from qa import retrieve_chunks, generate_answer
22
-
23
- # ----------------------------
24
  # App Config
25
- # ----------------------------
26
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
27
  st.title("πŸ“„ Enterprise Knowledge Assistant")
 
28
 
29
- st.write("Upload a PDF **or try the sample file** to explore this assistant.")
30
-
31
- # ----------------------------
32
- # Sidebar (Settings + Credits)
33
- # ----------------------------
34
  with st.sidebar:
35
- st.image("src/logo.png", width=150)
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  st.header("βš™οΈ Settings")
37
  chunk_size = st.slider("Chunk Size", 200, 1000, 500, step=100)
38
  top_k = st.slider("Top K Results", 1, 5, 3)
39
 
40
  st.markdown("---")
 
 
41
  st.caption("πŸ‘¨β€πŸ’» Built by Shubham Sharma")
42
  st.markdown("[πŸ“‚ GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
43
 
44
- # ----------------------------
45
- # File Upload Section
46
- # ----------------------------
47
- uploaded_file = st.file_uploader("πŸ“‚ Upload your PDF", type="pdf")
48
-
49
- if st.button("πŸ“˜ Try with Sample PDF"):
50
- uploaded_file = open("src/sample.pdf", "rb")
51
- st.session_state["use_sample"] = True
52
- else:
53
- st.session_state["use_sample"] = False
54
-
55
- # ----------------------------
56
- # File Handling + Processing
57
- # ----------------------------
58
- if uploaded_file:
59
- if st.session_state.get("use_sample", False):
60
- temp_path = os.path.join("app", "sample.pdf")
61
- st.info("Using **default sample.pdf** βœ…")
62
- else:
63
- temp_path = os.path.join("temp.pdf")
64
- with open(temp_path, "wb") as f:
65
- f.write(uploaded_file.getbuffer())
66
 
67
- # Phase 2: Extract Text
68
- text = extract_text_from_pdf(temp_path)
69
- st.subheader("πŸ“‘ Extracted Text (Preview)")
70
- st.write(text[:1000])
71
 
72
- # Phase 3: Chunking
 
 
 
73
  chunks = chunk_text(text, chunk_size=chunk_size)
74
- st.write(f"πŸ“¦ Total Chunks Created: {len(chunks)}")
75
-
76
- st.subheader("🧩 Chunked Text (First 3 Chunks)")
77
- for i, chunk in enumerate(chunks[:3], start=1):
78
- st.write(f"**Chunk {i}:** {chunk}")
79
-
80
- # Phase 4: Embeddings
81
  embeddings = generate_embeddings(chunks)
82
- st.success(f"βœ… Generated {len(embeddings)} embeddings.")
83
-
84
- # Phase 5: Vector Store (FAISS)
85
  index = build_faiss_index(embeddings)
86
 
87
- # Phase 6 & 7: Q&A
88
- user_query = st.text_input("πŸ” Ask a question about the document:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
 
90
  if user_query:
91
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
92
  answer = generate_answer(user_query, retrieved)
93
 
94
- # Final Answer
95
- st.subheader("πŸ€– Assistant’s Answer")
96
  st.write(answer)
97
 
98
- # Supporting Chunks
99
- st.subheader("πŸ“„ Supporting Chunks")
100
- for i, r in enumerate(retrieved, start=1):
101
- st.write(f"**Chunk {i}:** {r}")
102
- else:
103
- st.info("⬅️ Upload a PDF or click 'Try with Sample PDF' to begin.")
 
import os
import tempfile

# ---------------------------
# Cache Fix for Hugging Face
# ---------------------------
# NOTE: these environment variables must be set BEFORE importing anything that
# pulls in huggingface/transformers (embeddings, qa, ...). The libraries read
# the cache location at import time, so setting them after the imports (as a
# previous revision's comment warned) makes the redirect to the writable
# /tmp directory a no-op on a deployed Space.
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["HF_MODULES_CACHE"] = CACHE_DIR

# Imports AFTER the cache bootstrap (see note above).
import streamlit as st
from ingestion import extract_text_from_pdf, chunk_text
from embeddings import generate_embeddings
from vectorstore import build_faiss_index
from qa import retrieve_chunks, generate_answer

# ---------------------------
# Paths
# ---------------------------
BASE_DIR = os.path.dirname(__file__)  # e.g. /app/src
LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")

# ---------------------------
# App Config
# ---------------------------
st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
st.title("📄 Enterprise Knowledge Assistant")
st.caption("Select a document from the library or upload your own, then ask questions.")

# ---------------------------
# Sidebar (Library + Settings + Credits)
# ---------------------------
with st.sidebar:
    if os.path.exists(LOGO_PATH):
        st.image(LOGO_PATH, width=150)

    # 1. Document Library
    st.header("📚 Document Library")
    doc_choice = st.radio(
        "Choose a document:",
        ["-- Select --", "Sample PDF", "Upload Custom PDF"],
        index=0,
    )

    st.markdown("---")

    # 2. Settings
    chunk_size = st.slider("Chunk Size", 200, 1000, 500, step=100)
    top_k = st.slider("Top K Results", 1, 5, 3)

    st.markdown("---")

    # 3. Branding
    st.caption("👨‍💻 Built by Shubham Sharma")
    st.markdown("[📂 GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")


def _build_pipeline(pdf_path, size):
    """Extract text from *pdf_path*, chunk it, embed the chunks and build a
    FAISS index.

    Returns a ``(text, chunks, index)`` tuple.
    """
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text, chunk_size=size)
    embeddings = generate_embeddings(chunks)
    return text, chunks, build_faiss_index(embeddings)


# ---------------------------
# Document Handling
# ---------------------------
text, chunks, index = None, None, None

if doc_choice == "-- Select --":
    st.info("⬅️ Please choose **Sample PDF** or **Upload Custom PDF** from the sidebar.")

elif doc_choice == "Sample PDF":
    st.success("📘 Sample PDF selected")
    text, chunks, index = _build_pipeline(SAMPLE_PATH, chunk_size)

elif doc_choice == "Upload Custom PDF":
    uploaded_file = st.file_uploader("📂 Upload your PDF", type="pdf")
    if uploaded_file:
        # Write to the system temp dir: the app directory is typically
        # read-only on a deployed Space, while /tmp is always writable.
        temp_path = os.path.join(tempfile.gettempdir(), "temp.pdf")
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.success("✅ Document uploaded and processed!")
        text, chunks, index = _build_pipeline(temp_path, chunk_size)

# ---------------------------
# Document Preview
# ---------------------------
if chunks:
    st.subheader("📑 Document Preview")
    st.text_area("Extracted text (first 1000 chars)", text[:1000], height=150)
    st.caption(f"📦 {len(chunks)} chunks created")

# ---------------------------
# Query Section
# ---------------------------
# Compare against None rather than relying on truthiness: a FAISS index does
# not reliably define __bool__, and an index built from few vectors must not
# be skipped by accident.
if index is not None and chunks:
    st.markdown("---")
    st.subheader("🤖 Ask a Question")

    user_query = st.text_input("🔍 Your question about the document:")
    if user_query:
        retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
        answer = generate_answer(user_query, retrieved)

        st.markdown("### ✅ Assistant’s Answer")
        st.write(answer)

        with st.expander("📄 Supporting Chunks"):
            for i, r in enumerate(retrieved, start=1):
                st.markdown(f"**Chunk {i}:** {r}")