Update src/streamlit_app.py
Browse files- src/streamlit_app.py +26 -123
src/streamlit_app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# ==========================================================
|
| 2 |
-
# streamlit_app.py — Stable Layout
|
| 3 |
# ==========================================================
|
| 4 |
import os
|
| 5 |
import re
|
|
@@ -32,44 +32,15 @@ from vectorstore import build_faiss_index
|
|
| 32 |
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
|
| 33 |
|
| 34 |
# ==========================================================
|
| 35 |
-
# 🧠
|
| 36 |
# ==========================================================
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def detect_language(text_sample: str) -> str:
|
| 40 |
"""
|
| 41 |
-
|
| 42 |
-
Returns "hi" for Hindi and "en" for English.
|
| 43 |
-
"""
|
| 44 |
-
try:
|
| 45 |
-
# Quick Unicode-based detection for Hindi
|
| 46 |
-
if re.search(r"[\u0900-\u097F]", text_sample):
|
| 47 |
-
return "hi"
|
| 48 |
-
|
| 49 |
-
# Fallback to langdetect
|
| 50 |
-
lang = detect(text_sample)
|
| 51 |
-
return "hi" if lang.startswith("hi") else "en"
|
| 52 |
-
except Exception:
|
| 53 |
-
return "en"
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
# ==========================================================
|
| 58 |
-
# 🧠 SMART SUGGESTION GENERATOR — bilingual (Hindi + English)
|
| 59 |
-
# ==========================================================
|
| 60 |
-
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_lang="en"):
|
| 61 |
-
"""
|
| 62 |
-
Generates 5-7 short, natural questions from TOC + a sample of chunks.
|
| 63 |
-
If doc_lang == "hi", the prompt asks the model to return questions in Hindi.
|
| 64 |
"""
|
| 65 |
if not toc or not chunks:
|
| 66 |
-
|
| 67 |
-
return ["How do I start using this guide?", "What does this document cover?"] if doc_lang != "hi" else [
|
| 68 |
-
"मैं इस गाइड का उपयोग कैसे शुरू करूँ?",
|
| 69 |
-
"यह दस्तावेज़ क्या कवर करता है?"
|
| 70 |
-
]
|
| 71 |
|
| 72 |
-
# Build candidate titles from TOC
|
| 73 |
titles = []
|
| 74 |
for sec, raw_title in toc:
|
| 75 |
title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
|
|
@@ -78,27 +49,10 @@ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_
|
|
| 78 |
titles.append(title)
|
| 79 |
|
| 80 |
context_sample = " ".join(chunks[:3])[:4000]
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
आप एक सामग्री सहायक हैं। नीचे दिए गए तालिका-समाचार (Table of Contents) और दस्तावेज़ के नमूना पाठ के आधार पर 5 से 7 संक्षिप्त, साफ़ और मानवीय प्रश्न बनाइए।
|
| 86 |
-
प्रत्येक प्रश्न हिंदी में होना चाहिए, 18 शब्दों से कम, और प्रश्न चिह्न "?" के साथ समाप्त होना चाहिए। प्रश्न केवल दस्तावेज़ से प्रेरित हों — नई जानकारी इजाद न करें।
|
| 87 |
-
|
| 88 |
-
दस्तावेज़: "{doc_name}"
|
| 89 |
-
|
| 90 |
-
TABLE OF CONTENTS:
|
| 91 |
-
{chr(10).join(['- ' + t for t in titles[:8]])}
|
| 92 |
-
|
| 93 |
-
SAMPLE TEXT:
|
| 94 |
-
{context_sample}
|
| 95 |
-
|
| 96 |
-
आउटपुट: हर प्रश्न को नई लाइन पर लिखें, किसी भी क्रम चिन्ह के साथ (1., -, •) चलेगा। केवल प्रश्न लिखें।
|
| 97 |
-
"""
|
| 98 |
-
else:
|
| 99 |
-
prompt = f"""
|
| 100 |
-
You are a content assistant. Based on the Table of Contents and the sample document text below, generate 5–7 short, natural user-facing questions.
|
| 101 |
-
Each question should be in English, <18 words, and end with a question mark.
|
| 102 |
Document: "{doc_name}"
|
| 103 |
|
| 104 |
TABLE OF CONTENTS:
|
|
@@ -107,72 +61,31 @@ TABLE OF CONTENTS:
|
|
| 107 |
SAMPLE TEXT:
|
| 108 |
{context_sample}
|
| 109 |
|
| 110 |
-
Output:
|
| 111 |
"""
|
| 112 |
|
| 113 |
try:
|
| 114 |
ai_response = genai_generate(prompt)
|
| 115 |
-
|
| 116 |
-
# Normalize response to lines
|
| 117 |
lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
|
| 118 |
-
|
| 119 |
-
# Heuristics to extract candidate questions
|
| 120 |
-
candidates = []
|
| 121 |
for ln in lines:
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
# if line already ends with a question mark, keep it
|
| 126 |
-
if ln_clean.endswith("?"):
|
| 127 |
-
q = ln_clean
|
| 128 |
-
else:
|
| 129 |
-
# sometimes model returns without "?" but as a question — add "?" if short and starts with W/H or Hindi question words
|
| 130 |
-
if (len(ln_clean.split()) < 18) and re.match(r"(?i)^(what|how|why|where|who|when|which)\b", ln_clean):
|
| 131 |
-
q = ln_clean + "?"
|
| 132 |
-
# Hindi question words heuristic
|
| 133 |
-
elif re.match(r"^(क्या|क्यों|कैसे|कहाँ|कौन|किस|कब)\b", ln_clean):
|
| 134 |
-
q = ln_clean if ln_clean.endswith("?") else ln_clean + "?"
|
| 135 |
-
else:
|
| 136 |
-
# skip lines that don't look like questions
|
| 137 |
-
continue
|
| 138 |
-
|
| 139 |
-
# length/filter
|
| 140 |
-
q = q.strip()
|
| 141 |
if 8 <= len(q) <= 140:
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
# dedupe while preserving order
|
| 145 |
-
seen = set()
|
| 146 |
final = []
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
if
|
| 150 |
-
seen.add(
|
| 151 |
final.append(q)
|
| 152 |
-
|
| 153 |
-
# If we ended up with none, fallback to naive generation from titles
|
| 154 |
if not final:
|
| 155 |
-
|
| 156 |
-
for t in titles[:7]:
|
| 157 |
-
if str(doc_lang).startswith("hi"):
|
| 158 |
-
cand = t.rstrip(".") + " के बारे में क्या जानना चाहिए?"
|
| 159 |
-
else:
|
| 160 |
-
cand = "What should I know about " + t.rstrip(".") + "?"
|
| 161 |
-
final.append(cand)
|
| 162 |
-
# limit to 7
|
| 163 |
return final[:7]
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
# graceful bilingual fallback
|
| 167 |
-
if str(doc_lang).startswith("hi"):
|
| 168 |
-
return [
|
| 169 |
-
"इस दस्तावेज़ को कैसे शुरू करूँ?",
|
| 170 |
-
"इस दस्तावेज़ का मुख्य उद्देश्य क्या है?",
|
| 171 |
-
"प्रमुख हिस्से कौन से हैं?"
|
| 172 |
-
]
|
| 173 |
-
else:
|
| 174 |
-
return ["How do I start using this guide?", "What does this document cover?"]
|
| 175 |
-
|
| 176 |
|
| 177 |
# ==========================================================
|
| 178 |
# 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
|
|
@@ -254,7 +167,6 @@ for key, val in {
|
|
| 254 |
"selected_suggestion": None,
|
| 255 |
"query_suggestions_fixed": None,
|
| 256 |
"last_doc": None,
|
| 257 |
-
"doc_lang": "en", # 🆕 store document language
|
| 258 |
}.items():
|
| 259 |
if key not in st.session_state:
|
| 260 |
st.session_state[key] = val
|
|
@@ -262,7 +174,7 @@ for key, val in {
|
|
| 262 |
def set_user_query(q, idx):
|
| 263 |
st.session_state["user_query_input"] = q
|
| 264 |
st.session_state["selected_suggestion"] = idx
|
| 265 |
-
st.
|
| 266 |
|
| 267 |
# ==========================================================
|
| 268 |
# 📄 MAIN SECTION
|
|
@@ -296,20 +208,13 @@ else:
|
|
| 296 |
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 297 |
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 298 |
|
| 299 |
-
# 🌐 Detect document language (robust multilingual)
|
| 300 |
-
doc_sample = " ".join(chunks[:3])[:3000]
|
| 301 |
-
doc_lang = detect_language(doc_sample)
|
| 302 |
-
st.session_state["doc_lang"] = doc_lang
|
| 303 |
-
lang_label = "Hindi" if doc_lang.startswith("hi") else "English"
|
| 304 |
-
st.caption(f"🈹 Detected document language: {lang_label}")
|
| 305 |
-
|
| 306 |
with st.spinner("⚙️ Building search index..."):
|
| 307 |
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 308 |
index = build_faiss_index(embeddings)
|
| 309 |
|
| 310 |
doc_name = os.path.basename(temp_path)
|
| 311 |
if st.session_state["last_doc"] != doc_name:
|
| 312 |
-
query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name
|
| 313 |
st.session_state["query_suggestions_fixed"] = query_suggestions
|
| 314 |
st.session_state["last_doc"] = doc_name
|
| 315 |
st.session_state["user_query_input"] = ""
|
|
@@ -345,9 +250,7 @@ else:
|
|
| 345 |
reasoning_mode = mode == "Extended (Document + general)"
|
| 346 |
with st.spinner("💭 Generating your answer..."):
|
| 347 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 348 |
-
|
| 349 |
-
print("🧠 Document language used for GPT prompt:", doc_lang)
|
| 350 |
-
answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode, doc_lang=doc_lang)
|
| 351 |
|
| 352 |
st.markdown("### 🤖 Assistant’s Answer")
|
| 353 |
|
|
|
|
| 1 |
# ==========================================================
|
| 2 |
+
# streamlit_app.py — Stable Layout (English Only)
|
| 3 |
# ==========================================================
|
| 4 |
import os
|
| 5 |
import re
|
|
|
|
| 32 |
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
|
| 33 |
|
| 34 |
# ==========================================================
|
| 35 |
+
# 🧠 SMART SUGGESTION GENERATOR (English Only)
|
| 36 |
# ==========================================================
|
| 37 |
+
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
    """
    Generate 5–7 short, natural English questions from a document's TOC and text.

    Parameters
    ----------
    toc : list[tuple]
        (section, raw_title) pairs as produced by the PDF extractor.
    chunks : list[str]
        Text chunks of the document; the first three are sampled for context.
    doc_name : str
        Display name of the document, embedded in the LLM prompt.

    Returns
    -------
    list[str]
        At most 7 deduplicated question strings. Falls back to two generic
        questions when `toc`/`chunks` are empty or the LLM call fails.
    """
    if not toc or not chunks:
        return ["How do I start using this guide?", "What does this document cover?"]

    titles = []
    for sec, raw_title in toc:
        # Strip leading numbering such as "3.1 " or "A. " from the raw title.
        title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
        # NOTE(review): the diff elided one or two lines here (likely a
        # title-quality/length filter); restored as a minimal non-empty
        # guard — confirm against the original file.
        if title:
            titles.append(title)

    # Cap the prompt context at 4000 chars from the first three chunks.
    context_sample = " ".join(chunks[:3])[:4000]
    prompt = f"""
You are a content assistant. Based on the Table of Contents and the sample document text below,
generate 5–7 short, natural user-facing questions.
Each question should be under 18 words, end with a question mark, and sound human.
Document: "{doc_name}"

TABLE OF CONTENTS:
{chr(10).join(['- ' + t for t in titles[:8]])}

SAMPLE TEXT:
{context_sample}

Output: Write each question on a new line. Do not invent facts — base questions only on the document.
"""
    try:
        ai_response = genai_generate(prompt)
        lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
        questions = []
        for ln in lines:
            # Drop list markers (-, •, *, "1.", "2)") the model may prepend.
            q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
            # Add a trailing "?" to short lines that clearly start like a question.
            if not q.endswith("?") and len(q.split()) < 18 and re.match(
                r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q
            ):
                q += "?"
            # Keep plausibly-sized candidates only (too short = noise, too long = paragraph).
            if 8 <= len(q) <= 140:
                questions.append(q)

        # Deduplicate case-insensitively while preserving first-seen order.
        final = []
        seen = set()
        for q in questions:
            if q.lower() not in seen:
                seen.add(q.lower())
                final.append(q)

        # If the model produced nothing usable, derive naive questions from titles.
        if not final:
            final = [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
        return final[:7]
    except Exception:
        # Graceful fallback on any model/parse failure.
        return ["How do I start using this guide?", "What does this document cover?"]
| 89 |
|
| 90 |
# ==========================================================
|
| 91 |
# 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
|
|
|
|
| 167 |
"selected_suggestion": None,
|
| 168 |
"query_suggestions_fixed": None,
|
| 169 |
"last_doc": None,
|
|
|
|
| 170 |
}.items():
|
| 171 |
if key not in st.session_state:
|
| 172 |
st.session_state[key] = val
|
|
|
|
| 174 |
def set_user_query(q, idx):
    """Store a clicked suggestion as the active query and remember its index."""
    st.session_state["user_query_input"] = q
    st.session_state["selected_suggestion"] = idx
    # NOTE(review): if this is used as a widget on_click callback, Streamlit
    # already reruns after the callback and treats st.rerun() inside one as a
    # no-op — confirm this call is actually needed at the call sites.
    st.rerun()
| 178 |
|
| 179 |
# ==========================================================
|
| 180 |
# 📄 MAIN SECTION
|
|
|
|
| 208 |
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 209 |
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
with st.spinner("⚙️ Building search index..."):
|
| 212 |
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 213 |
index = build_faiss_index(embeddings)
|
| 214 |
|
| 215 |
doc_name = os.path.basename(temp_path)
|
| 216 |
if st.session_state["last_doc"] != doc_name:
|
| 217 |
+
query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
|
| 218 |
st.session_state["query_suggestions_fixed"] = query_suggestions
|
| 219 |
st.session_state["last_doc"] = doc_name
|
| 220 |
st.session_state["user_query_input"] = ""
|
|
|
|
| 250 |
reasoning_mode = mode == "Extended (Document + general)"
|
| 251 |
with st.spinner("💭 Generating your answer..."):
|
| 252 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 253 |
+
answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
|
|
|
|
|
|
|
| 254 |
|
| 255 |
st.markdown("### 🤖 Assistant’s Answer")
|
| 256 |
|