Shubham170793 committed on
Commit
51344d2
·
verified ·
1 Parent(s): 65116ce

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +26 -123
src/streamlit_app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==========================================================
2
- # streamlit_app.py — Stable Layout + Multilingual Enhancement (Hindi + English)
3
  # ==========================================================
4
  import os
5
  import re
@@ -32,44 +32,15 @@ from vectorstore import build_faiss_index
32
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
33
 
34
  # ==========================================================
35
- # 🧠 LANGUAGE DETECTION HELPER (Fast, No Dependencies)
36
  # ==========================================================
37
- from langdetect import detect
38
-
39
- def detect_language(text_sample: str) -> str:
40
  """
41
- Detects Hindi (Devanagari) or English.
42
- Returns "hi" for Hindi and "en" for English.
43
- """
44
- try:
45
- # Quick Unicode-based detection for Hindi
46
- if re.search(r"[\u0900-\u097F]", text_sample):
47
- return "hi"
48
-
49
- # Fallback to langdetect
50
- lang = detect(text_sample)
51
- return "hi" if lang.startswith("hi") else "en"
52
- except Exception:
53
- return "en"
54
-
55
-
56
-
57
- # ==========================================================
58
- # 🧠 SMART SUGGESTION GENERATOR — bilingual (Hindi + English)
59
- # ==========================================================
60
- def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_lang="en"):
61
- """
62
- Generates 5-7 short, natural questions from TOC + a sample of chunks.
63
- If doc_lang == "hi", the prompt asks the model to return questions in Hindi.
64
  """
65
  if not toc or not chunks:
66
- # sensible bilingual fallback
67
- return ["How do I start using this guide?", "What does this document cover?"] if doc_lang != "hi" else [
68
- "मैं इस गाइड का उपयोग कैसे शुरू करूँ?",
69
- "यह दस्तावेज़ क्या कवर करता है?"
70
- ]
71
 
72
- # Build candidate titles from TOC
73
  titles = []
74
  for sec, raw_title in toc:
75
  title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
@@ -78,27 +49,10 @@ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_
78
  titles.append(title)
79
 
80
  context_sample = " ".join(chunks[:3])[:4000]
81
-
82
- # Choose language-aware prompt
83
- if str(doc_lang).startswith("hi"):
84
- prompt = f"""
85
- आप एक सामग्री सहायक हैं। नीचे दिए गए तालिका-समाचार (Table of Contents) और दस्तावेज़ के नमूना पाठ के आधार पर 5 से 7 संक्षिप्त, साफ़ और मानवीय प्रश्न बनाइए।
86
- प्रत्येक प्रश्न हिंदी में होना चाहिए, 18 शब्दों से कम, और प्रश्न चिह्न "?" के साथ समाप्त होना चाहिए। प्रश्न केवल दस्तावेज़ से प्रेरित हों — नई जानकारी इजाद न करें।
87
-
88
- दस्तावेज़: "{doc_name}"
89
-
90
- TABLE OF CONTENTS:
91
- {chr(10).join(['- ' + t for t in titles[:8]])}
92
-
93
- SAMPLE TEXT:
94
- {context_sample}
95
-
96
- आउटपुट: हर प्रश्न को नई लाइन पर लिखें, किसी भी क्रम चिन्ह के साथ (1., -, •) चलेगा। केवल प्रश्न लिखें।
97
- """
98
- else:
99
- prompt = f"""
100
- You are a content assistant. Based on the Table of Contents and the sample document text below, generate 5–7 short, natural user-facing questions.
101
- Each question should be in English, <18 words, and end with a question mark.
102
  Document: "{doc_name}"
103
 
104
  TABLE OF CONTENTS:
@@ -107,72 +61,31 @@ TABLE OF CONTENTS:
107
  SAMPLE TEXT:
108
  {context_sample}
109
 
110
- Output: Put one question per line. Do not invent facts — base questions on the document.
111
  """
112
 
113
  try:
114
  ai_response = genai_generate(prompt)
115
-
116
- # Normalize response to lines
117
  lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
118
-
119
- # Heuristics to extract candidate questions
120
- candidates = []
121
  for ln in lines:
122
- # remove bullet/ordinal prefixes like "1.", "-", "•"
123
- ln_clean = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
124
-
125
- # if line already ends with a question mark, keep it
126
- if ln_clean.endswith("?"):
127
- q = ln_clean
128
- else:
129
- # sometimes model returns without "?" but as a question — add "?" if short and starts with W/H or Hindi question words
130
- if (len(ln_clean.split()) < 18) and re.match(r"(?i)^(what|how|why|where|who|when|which)\b", ln_clean):
131
- q = ln_clean + "?"
132
- # Hindi question words heuristic
133
- elif re.match(r"^(क्या|क्यों|कैसे|कहाँ|कौन|किस|कब)\b", ln_clean):
134
- q = ln_clean if ln_clean.endswith("?") else ln_clean + "?"
135
- else:
136
- # skip lines that don't look like questions
137
- continue
138
-
139
- # length/filter
140
- q = q.strip()
141
  if 8 <= len(q) <= 140:
142
- candidates.append(q)
143
-
144
- # dedupe while preserving order
145
- seen = set()
146
  final = []
147
- for q in candidates:
148
- key = q.lower()
149
- if key not in seen:
150
- seen.add(key)
151
  final.append(q)
152
-
153
- # If we ended up with none, fallback to naive generation from titles
154
  if not final:
155
- # form simple question templates from titles
156
- for t in titles[:7]:
157
- if str(doc_lang).startswith("hi"):
158
- cand = t.rstrip(".") + " के बारे में क्या जानना चाहिए?"
159
- else:
160
- cand = "What should I know about " + t.rstrip(".") + "?"
161
- final.append(cand)
162
- # limit to 7
163
  return final[:7]
164
-
165
- except Exception as e:
166
- # graceful bilingual fallback
167
- if str(doc_lang).startswith("hi"):
168
- return [
169
- "इस दस्तावेज़ को कैसे शुरू करूँ?",
170
- "इस दस्तावेज़ का मुख्य उद्देश्य क्या है?",
171
- "प्रमुख हिस्से कौन से हैं?"
172
- ]
173
- else:
174
- return ["How do I start using this guide?", "What does this document cover?"]
175
-
176
 
177
  # ==========================================================
178
  # 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
@@ -254,7 +167,6 @@ for key, val in {
254
  "selected_suggestion": None,
255
  "query_suggestions_fixed": None,
256
  "last_doc": None,
257
- "doc_lang": "en", # 🆕 store document language
258
  }.items():
259
  if key not in st.session_state:
260
  st.session_state[key] = val
@@ -262,7 +174,7 @@ for key, val in {
262
  def set_user_query(q, idx):
263
  st.session_state["user_query_input"] = q
264
  st.session_state["selected_suggestion"] = idx
265
- st.experimental_rerun()
266
 
267
  # ==========================================================
268
  # 📄 MAIN SECTION
@@ -296,20 +208,13 @@ else:
296
  text, toc, toc_source = extract_text_from_pdf(temp_path)
297
  chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
298
 
299
- # 🌐 Detect document language (robust multilingual)
300
- doc_sample = " ".join(chunks[:3])[:3000]
301
- doc_lang = detect_language(doc_sample)
302
- st.session_state["doc_lang"] = doc_lang
303
- lang_label = "Hindi" if doc_lang.startswith("hi") else "English"
304
- st.caption(f"🈹 Detected document language: {lang_label}")
305
-
306
  with st.spinner("⚙️ Building search index..."):
307
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
308
  index = build_faiss_index(embeddings)
309
 
310
  doc_name = os.path.basename(temp_path)
311
  if st.session_state["last_doc"] != doc_name:
312
- query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name, doc_lang)
313
  st.session_state["query_suggestions_fixed"] = query_suggestions
314
  st.session_state["last_doc"] = doc_name
315
  st.session_state["user_query_input"] = ""
@@ -345,9 +250,7 @@ else:
345
  reasoning_mode = mode == "Extended (Document + general)"
346
  with st.spinner("💭 Generating your answer..."):
347
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
348
- doc_lang = st.session_state.get("doc_lang", "en")
349
- print("🧠 Document language used for GPT prompt:", doc_lang)
350
- answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode, doc_lang=doc_lang)
351
 
352
  st.markdown("### 🤖 Assistant’s Answer")
353
 
 
1
  # ==========================================================
2
+ # streamlit_app.py — Stable Layout (English Only)
3
  # ==========================================================
4
  import os
5
  import re
 
32
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
33
 
34
  # ==========================================================
35
+ # 🧠 SMART SUGGESTION GENERATOR (English Only)
36
  # ==========================================================
37
+ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
 
 
38
  """
39
+ Generates 5–7 short, natural English questions based on TOC and document text.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  """
41
  if not toc or not chunks:
42
+ return ["How do I start using this guide?", "What does this document cover?"]
 
 
 
 
43
 
 
44
  titles = []
45
  for sec, raw_title in toc:
46
  title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
 
49
  titles.append(title)
50
 
51
  context_sample = " ".join(chunks[:3])[:4000]
52
+ prompt = f"""
53
+ You are a content assistant. Based on the Table of Contents and the sample document text below,
54
+ generate 5–7 short, natural user-facing questions.
55
+ Each question should be under 18 words, end with a question mark, and sound human.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  Document: "{doc_name}"
57
 
58
  TABLE OF CONTENTS:
 
61
  SAMPLE TEXT:
62
  {context_sample}
63
 
64
+ Output: Write each question on a new line. Do not invent facts — base questions only on the document.
65
  """
66
 
67
  try:
68
  ai_response = genai_generate(prompt)
 
 
69
  lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
70
+ questions = []
 
 
71
  for ln in lines:
72
+ q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
73
+ if not q.endswith("?") and len(q.split()) < 18 and re.match(r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q):
74
+ q += "?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  if 8 <= len(q) <= 140:
76
+ questions.append(q)
77
+ # dedupe
 
 
78
  final = []
79
+ seen = set()
80
+ for q in questions:
81
+ if q.lower() not in seen:
82
+ seen.add(q.lower())
83
  final.append(q)
 
 
84
  if not final:
85
+ final = [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
 
 
 
 
 
 
 
86
  return final[:7]
87
+ except Exception:
88
+ return ["How do I start using this guide?", "What does this document cover?"]
 
 
 
 
 
 
 
 
 
 
89
 
90
  # ==========================================================
91
  # 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
 
167
  "selected_suggestion": None,
168
  "query_suggestions_fixed": None,
169
  "last_doc": None,
 
170
  }.items():
171
  if key not in st.session_state:
172
  st.session_state[key] = val
 
174
  def set_user_query(q, idx):
175
  st.session_state["user_query_input"] = q
176
  st.session_state["selected_suggestion"] = idx
177
+ st.rerun()
178
 
179
  # ==========================================================
180
  # 📄 MAIN SECTION
 
208
  text, toc, toc_source = extract_text_from_pdf(temp_path)
209
  chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
210
 
 
 
 
 
 
 
 
211
  with st.spinner("⚙️ Building search index..."):
212
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
213
  index = build_faiss_index(embeddings)
214
 
215
  doc_name = os.path.basename(temp_path)
216
  if st.session_state["last_doc"] != doc_name:
217
+ query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
218
  st.session_state["query_suggestions_fixed"] = query_suggestions
219
  st.session_state["last_doc"] = doc_name
220
  st.session_state["user_query_input"] = ""
 
250
  reasoning_mode = mode == "Extended (Document + general)"
251
  with st.spinner("💭 Generating your answer..."):
252
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
253
+ answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
 
 
254
 
255
  st.markdown("### 🤖 Assistant’s Answer")
256