Update src/streamlit_app.py
Browse files- src/streamlit_app.py +103 -22
src/streamlit_app.py
CHANGED
|
@@ -55,43 +55,124 @@ def detect_language(text_sample: str) -> str:
|
|
| 55 |
|
| 56 |
|
| 57 |
# ==========================================================
|
| 58 |
-
# 🧠 SMART SUGGESTION GENERATOR
|
| 59 |
# ==========================================================
|
| 60 |
-
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
if not toc or not chunks:
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
titles = []
|
| 64 |
for sec, raw_title in toc:
|
| 65 |
title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
|
| 66 |
title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
|
| 67 |
if 4 < len(title) < 120:
|
| 68 |
titles.append(title)
|
|
|
|
| 69 |
context_sample = " ".join(chunks[:3])[:4000]
|
| 70 |
-
prompt = f"""
|
| 71 |
-
You are generating short, natural, and context-aware questions for users reading "{doc_name}".
|
| 72 |
-
Use the Table of Contents and some document text for inspiration.
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
Generate 5–7 clear and human-like questions based strictly on this document.
|
| 81 |
-
Each should sound natural, under 18 words, and avoid robotic phrasing.
|
| 82 |
-
"""
|
| 83 |
try:
|
| 84 |
ai_response = genai_generate(prompt)
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
final.append(q)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
return final[:7]
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
# ==========================================================
|
| 97 |
# 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
|
|
@@ -228,7 +309,7 @@ else:
|
|
| 228 |
|
| 229 |
doc_name = os.path.basename(temp_path)
|
| 230 |
if st.session_state["last_doc"] != doc_name:
|
| 231 |
-
query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
|
| 232 |
st.session_state["query_suggestions_fixed"] = query_suggestions
|
| 233 |
st.session_state["last_doc"] = doc_name
|
| 234 |
st.session_state["user_query_input"] = ""
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
# ==========================================================
|
| 58 |
+
# 🧠 SMART SUGGESTION GENERATOR — bilingual (Hindi + English)
|
| 59 |
# ==========================================================
|
| 60 |
+
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_lang="en", *, generate=None):
    """Return up to seven short suggested questions for a document.

    A language-specific prompt is built from the cleaned TOC titles and the
    first three chunks, sent to the text model, and the reply is parsed into
    de-duplicated questions.

    Args:
        toc: Iterable of ``(section, raw_title)`` pairs.
        chunks: List of document text chunks; only the first three are used,
            truncated to 4000 characters in total.
        doc_name: Display name of the document, quoted in the prompt.
        doc_lang: Language code. Any value starting with ``"hi"`` selects the
            Hindi prompt and Hindi fallbacks; everything else is English.
        generate: Optional callable ``prompt -> str`` used instead of the
            module's ``genai_generate`` (useful for alternative backends).

    Returns:
        A non-empty list of at most seven question strings. Falls back to
        questions derived from TOC titles when the model reply contains no
        usable question, and to fixed generic questions when the input is
        empty, the model call fails, or no title is usable either.
    """
    import logging

    lang_key = "hi" if str(doc_lang).startswith("hi") else "en"

    if not toc or not chunks:
        return list(_GENERIC_SUGGESTIONS[lang_key])

    titles = _clean_toc_titles(toc)
    context_sample = " ".join(chunks[:3])[:4000]
    prompt = _build_suggestion_prompt(doc_name, titles, context_sample, lang_key)

    try:
        # The default backend is looked up inside the try so that any failure
        # to reach the model degrades to the static fallback.
        ask_model = genai_generate if generate is None else generate
        questions = _extract_questions(ask_model(prompt))
    except Exception:
        # Best-effort feature: never break the page, but leave a trace.
        logging.getLogger(__name__).warning(
            "Suggestion generation failed for %r; using static fallback.",
            doc_name,
            exc_info=True,
        )
        return list(_ERROR_SUGGESTIONS[lang_key])

    if not questions:
        questions = _questions_from_titles(titles, lang_key)
    if not questions:
        # Neither the model nor the TOC produced anything usable.
        return list(_GENERIC_SUGGESTIONS[lang_key])
    return questions[:7]


# Fixed suggestions used when there is no TOC / no chunks (or nothing usable).
_GENERIC_SUGGESTIONS = {
    "en": ("How do I start using this guide?", "What does this document cover?"),
    "hi": (
        "मैं इस गाइड का उपयोग कैसे शुरू करूँ?",
        "यह दस्तावेज़ क्या कवर करता है?",
    ),
}

# Fixed suggestions used when the model call or reply parsing raises.
_ERROR_SUGGESTIONS = {
    "en": ("How do I start using this guide?", "What does this document cover?"),
    "hi": (
        "इस दस्तावेज़ को कैसे शुरू करूँ?",
        "इस दस्तावेज़ का मुख्य उद्देश्य क्या है?",
        "प्रमुख हिस्से कौन से हैं?",
    ),
}

# Leading section numbering such as "1 ", "1.2. ", "A.1 ", "IV. " or "a) ".
# It must look like numbering and be followed by whitespace, so an ordinary
# first word ("Getting Started", "3D Printing") is left alone.
_TOC_NUMBERING = re.compile(
    r"""^\s*(?:
        \d+(?:[.\-]\d+)*[.)\-]*
      | [A-Za-z](?:[.\-]\d+)+[.)\-]*
      | (?:[IVXLC]+|[ivxlc]+|[A-Za-z])[.)]
    )\s+""",
    re.VERBOSE,
)

# Trailing dot leader plus page number, e.g. "Overview ...... 12".
_TOC_PAGE_LEADER = re.compile(r"\.{2,}\s*\d+$")

# Bullet or ordinal markers at the start of a reply line ("-", "•", "*",
# "1.", "2)"). A bare leading digit that belongs to the text is kept.
_LIST_MARKER = re.compile(r"^(?:\s*(?:[-\u2022*]+|\d+[.)](?!\d)))+\s*")

_EN_QUESTION_START = re.compile(
    r"^(?:what|how|why|where|who|when|which)\b", re.IGNORECASE
)

# ``\b`` cannot be used here: Devanagari vowel signs are not ``\w`` in
# Python, so there is no word boundary after words like "क्या". Instead,
# require that no letter, digit or Devanagari sign follows the word.
_HI_QUESTION_START = re.compile(
    r"^(?:क्या|क्यों|कैसे|कहाँ|कौन|किस|कब)(?![\w\u0900-\u0963\u0966-\u097F])"
)


def _clean_toc_titles(toc):
    """Strip numbering and page leaders from TOC titles; keep 5-119 char ones."""
    titles = []
    for _section, raw_title in toc:
        title = _TOC_NUMBERING.sub("", raw_title)
        title = _TOC_PAGE_LEADER.sub("", title).strip()
        if 4 < len(title) < 120:
            titles.append(title)
    return titles


def _build_suggestion_prompt(doc_name, titles, context_sample, lang_key):
    """Build the Hindi or English model prompt from titles and sample text."""
    toc_lines = "\n".join("- " + title for title in titles[:8])
    if lang_key == "hi":
        return f"""
आप एक सामग्री सहायक हैं। नीचे दिए गए तालिका-समाचार (Table of Contents) और दस्तावेज़ के नमूना पाठ के आधार पर 5 से 7 संक्षिप्त, साफ़ और मानवीय प्रश्न बनाइए।
प्रत्येक प्रश्न हिंदी में होना चाहिए, 18 शब्दों से कम, और प्रश्न चिह्न "?" के साथ समाप्त होना चाहिए। प्रश्न केवल दस्तावेज़ से प्रेरित हों — नई जानकारी इजाद न करें।

दस्तावेज़: "{doc_name}"

TABLE OF CONTENTS:
{toc_lines}

SAMPLE TEXT:
{context_sample}

आउटपुट: हर प्रश्न को नई लाइन पर लिखें, किसी भी क्रम चिन्ह के साथ (1., -, •) चलेगा। केवल प्रश्न लिखें।
"""
    return f"""
You are a content assistant. Based on the Table of Contents and the sample document text below, generate 5–7 short, natural user-facing questions.
Each question should be in English, <18 words, and end with a question mark.
Document: "{doc_name}"

TABLE OF CONTENTS:
{toc_lines}

SAMPLE TEXT:
{context_sample}

Output: Put one question per line. Do not invent facts — base questions on the document.
"""


def _extract_questions(response):
    """Parse a model reply into ordered, de-duplicated question strings."""
    questions = []
    seen = set()
    for raw_line in response.splitlines():
        line = _LIST_MARKER.sub("", raw_line.strip()).strip()
        if not line:
            continue
        if not line.endswith("?"):
            # The model sometimes omits "?"; accept the line only when it
            # starts with a recognizable English or Hindi question word.
            english = len(line.split()) < 18 and _EN_QUESTION_START.match(line)
            if not (english or _HI_QUESTION_START.match(line)):
                continue
            line += "?"
        if not 8 <= len(line) <= 140:
            continue
        key = line.casefold()
        if key not in seen:
            seen.add(key)
            questions.append(line)
    return questions


def _questions_from_titles(titles, lang_key):
    """Turn up to seven TOC titles into simple template questions."""
    if lang_key == "hi":
        return [t.rstrip(".") + " के बारे में क्या जानना चाहिए?" for t in titles[:7]]
    return ["What should I know about " + t.rstrip(".") + "?" for t in titles[:7]]
|
| 175 |
+
|
| 176 |
|
| 177 |
# ==========================================================
|
| 178 |
# 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
|
|
|
|
| 309 |
|
| 310 |
doc_name = os.path.basename(temp_path)
|
| 311 |
if st.session_state["last_doc"] != doc_name:
|
| 312 |
+
query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name, doc_lang)
|
| 313 |
st.session_state["query_suggestions_fixed"] = query_suggestions
|
| 314 |
st.session_state["last_doc"] = doc_name
|
| 315 |
st.session_state["user_query_input"] = ""
|