MansoorSarookh commited on
Commit
87ecdf6
·
verified ·
1 Parent(s): 968c651

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -0
app.py CHANGED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import streamlit as st
4
+ from typing import List, Dict
5
+ from utils import (
6
+ extract_text, chunk_text, load_embedding_model, embed_texts,
7
+ get_vector_store, build_doc_store, generate_answer
8
+ )
9
+ import json
10
+ import time
11
+ import base64
12
+
13
# -------------------------
# App config
# -------------------------
st.set_page_config(page_title="Retrieval QA (Streamlit + Qdrant)", layout="wide")

# Per-session state, created once on the first run of a session.
# Streamlit re-executes this script on every interaction, so the defaults
# dict below is rebuilt each run; only missing keys are ever written.
_SESSION_DEFAULTS = {
    # list of dicts: {"role": "user/assistant/system", "text": "..."}
    "chat_history": [],
    # actual vector store instance (filled in lazily below)
    "kb_store": None,
    # metadata list of docs we added
    "docs_indexed": [],
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
26
# Load vector store once per session, preferring Qdrant when it is
# reachable (e.g. QDRANT_URL is set) and falling back to the local store.
if st.session_state.kb_store is None:
    try:
        st.session_state.kb_store = get_vector_store(prefer_qdrant=True)
    except Exception:
        # Qdrant unreachable/unconfigured — best-effort fallback to the
        # non-Qdrant store; if this also raises, let the error surface.
        st.session_state.kb_store = get_vector_store(prefer_qdrant=False)
    # Record the concrete store class name once, instead of duplicating
    # this assignment in both branches (and binding an unused `e`).
    st.session_state.kb_type = type(st.session_state.kb_store).__name__
35
# Sidebar: show which backends this session is running on.
with st.sidebar:
    st.title("Settings")
    st.write(f"Vector store: **{st.session_state.kb_type}**")
    st.write("Embedding model: `sentence-transformers/all-MiniLM-L6-v2`")
    st.write("Generator model: `google/flan-t5-small` (lightweight)")
40
# -------------------------
# UI - Upload files & options
# -------------------------
st.title("Retrieval QA — Upload PDF/DOCX and Ask Questions")
st.markdown("Upload PDF or DOCX files (≤100 pages). The app chunks documents, stores embeddings, and answers questions using a lightweight open-source model.")

with st.expander("Upload documents"):
    uploaded = st.file_uploader("Upload PDF or DOCX files (multiple)", accept_multiple_files=True, type=["pdf", "docx"])
    if uploaded:
        # Streamlit reruns this whole script on every interaction and
        # `st.file_uploader` keeps returning the same files, so without
        # this guard each file would be re-chunked and re-embedded on
        # every rerun, duplicating chunks in the vector store.
        already_indexed = {d["name"] for d in st.session_state.docs_indexed}
        for file in uploaded:
            name = file.name
            if name in already_indexed:
                continue  # indexed on an earlier rerun of this session
            bytestr = file.read()
            try:
                text = extract_text(name, bytestr)
            except Exception as e:
                st.error(f"Failed to extract text from {name}: {e}")
                continue
            # Whitespace-only extractions are as useless as empty ones;
            # the old `len(text.splitlines()) < 1` only caught "".
            if not text.strip():
                st.warning(f"No text found in {name}. Skipping.")
                continue
            # Chunk, embed and add to the knowledge base.
            with st.spinner(f"Indexing {name} ..."):
                added = build_doc_store(text, st.session_state.kb_store, chunk_size=1000, overlap=200, source_name=name)
            st.session_state.docs_indexed.append({"name": name, "chunks": len(added)})
            st.success(f"Indexed {name}: {len(added)} chunks")
67
# Chat input on the left, conversation controls on the right.
st.markdown("---")
chat_col, ctrl_col = st.columns([3,1])
with chat_col:
    st.subheader("Chat")
    query = st.text_input("Ask a question about the uploaded documents", placeholder="Type something like: 'What are the main conclusions?'")
    ask_button = st.button("Ask")
with ctrl_col:
    st.subheader("Controls")
    clear = st.button("Clear conversation")
    export = st.button("Export chat (JSON)")

if clear:
    # Drop only the conversation; indexed documents remain available.
    st.session_state.chat_history = []
    st.success("Cleared conversation")

if export:
    # Serialize the session and offer it as a data-URI download link.
    payload = {"chat": st.session_state.chat_history, "indexed_docs": st.session_state.docs_indexed}
    raw = json.dumps(payload, indent=2).encode("utf-8")
    encoded = base64.b64encode(raw).decode()
    href = f'<a href="data:application/json;base64,{encoded}" download="chat_export.json">Download chat JSON</a>'
    st.markdown(href, unsafe_allow_html=True)
89
# -------------------------
# Query handling
# -------------------------
if ask_button and query:
    st.session_state.chat_history.append({"role":"user", "text": query, "time": time.time()})
    # 1. Embed the query with the same model used for the document chunks.
    embed_model = load_embedding_model()
    q_emb = embed_model.encode([query], convert_to_numpy=True)[0]

    # 2. Retrieve the most similar chunks.
    top_k = 5
    hits = st.session_state.kb_store.query(q_emb, top_k=top_k)

    if not hits:
        # Nothing indexed (or nothing retrieved): skip the expensive
        # generation step — the model would only see an empty context.
        st.session_state.chat_history.append({
            "role": "assistant",
            "text": "I could not find the answer in the provided documents.",
            "time": time.time(),
            "source_chunks": [],
        })
    else:
        # 3. Build RAG prompt: include the retrieved context pieces.
        context_texts = [h[2] for h in hits]  # h = (id, score, text, metadata)
        combined_context = "\n\n---\n\n".join(context_texts)
        rag_prompt = f"""You are a helpful assistant. Use the following extracted context from uploaded documents to answer the user's question. If the answer is not in the context, say 'I could not find the answer in the provided documents.' Context:\n\n{combined_context}\n\nQuestion: {query}\n\nAnswer concisely but thoroughly."""
        # 4. Generate the answer and record it with retrieval provenance.
        with st.spinner("Generating answer..."):
            answer = generate_answer(rag_prompt, max_length=256)
        st.session_state.chat_history.append({"role":"assistant", "text": answer, "time": time.time(), "source_chunks": [{"score": h[1], "metadata": h[3]} for h in hits]})
111
# -------------------------
# Display chat history
# -------------------------
# Replay the conversation stored in session state, oldest first.
for entry in st.session_state.chat_history:
    if entry.get("role", "user") == "user":
        st.markdown(f"**You:** {entry['text']}")
        continue
    st.markdown(f"**Assistant:** {entry['text']}")
    # Assistant turns may carry retrieval provenance worth surfacing.
    if entry.get("source_chunks"):
        with st.expander("Sources / metadata"):
            for chunk_info in entry["source_chunks"]:
                st.write(chunk_info)
125
# Footer: summary of what is indexed, plus usage notes.
st.markdown("---")
st.subheader("Indexed Documents")
if not st.session_state.docs_indexed:
    st.write("No documents indexed yet.")
else:
    for doc_meta in st.session_state.docs_indexed:
        st.write(f"- {doc_meta['name']} — chunks: {doc_meta['chunks']}")

st.markdown("### Notes & Tips")
st.write("""
- For quick Colab testing you can run this app in a Colab cell using `streamlit run app.py` or run the main functions directly in Python.
- For production or Hugging Face Spaces, set up Qdrant (or Qdrant Cloud) and set `QDRANT_URL` env var to point to it.
- To improve results: increase chunk overlap, use a stronger embedding or generator model, or add chain-of-thought/history summarization.
""")