Shubham170793 committed on
Commit
1ffa2bc
·
verified ·
1 Parent(s): 418ad1d

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +103 -22
src/streamlit_app.py CHANGED
@@ -55,43 +55,124 @@ def detect_language(text_sample: str) -> str:
55
 
56
 
57
  # ==========================================================
58
- # 🧠 SMART SUGGESTION GENERATOR
59
  # ==========================================================
60
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
    """Ask the model for up to seven short questions about a document.

    Args:
        toc: Iterable of ``(section, raw_title)`` pairs; only the title is used.
        chunks: Sequence of text chunks; the first three are joined and
            truncated to 4000 characters to serve as sample context.
        doc_name: Display name of the document, quoted inside the prompt.

    Returns:
        A list of at most seven de-duplicated questions, each ending in "?".
        An empty list when ``toc`` or ``chunks`` is empty, and two generic
        English questions when anything in the model call or parsing raises.
    """
    if not (toc and chunks):
        return []

    # Drop the leading section label and any trailing dot-leader/page number.
    # NOTE(review): the first pattern removes any leading run of ASCII
    # letters/digits, so a title without a numeric label loses its first word.
    headings = []
    for _, heading in toc:
        stripped = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", heading)
        stripped = re.sub(r"\.{2,}\s*\d+$", "", stripped).strip()
        if 4 < len(stripped) < 120:
            headings.append(stripped)

    context_sample = " ".join(chunks[:3])[:4000]
    toc_listing = "\n".join("- " + entry for entry in headings[:8])
    prompt = f"""
You are generating short, natural, and context-aware questions for users reading "{doc_name}".
Use the Table of Contents and some document text for inspiration.

TABLE OF CONTENTS:
{toc_listing}

SAMPLE TEXT:
{context_sample}

Generate 5–7 clear and human-like questions based strictly on this document.
Each should sound natural, under 18 words, and avoid robotic phrasing.
"""

    try:
        reply = genai_generate(prompt)
        unique_questions = []
        seen_keys = set()
        # Every run of text that ends in "?" is a candidate question.
        for fragment in re.findall(r"[-•]?\s*(.+?)\?", reply):
            if not 8 < len(fragment) < 120:
                continue
            question = fragment.strip("•-— ").strip() + "?"
            key = question.lower()
            if key in seen_keys:
                continue
            seen_keys.add(key)
            unique_questions.append(question)
        return unique_questions[:7]
    except Exception:
        # Best-effort feature: any failure degrades to generic suggestions.
        return ["How do I start using this guide?", "What does this document cover?"]
 
 
 
 
 
 
 
 
 
 
95
 
96
  # ==========================================================
97
  # 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
@@ -228,7 +309,7 @@ else:
228
 
229
  doc_name = os.path.basename(temp_path)
230
  if st.session_state["last_doc"] != doc_name:
231
- query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
232
  st.session_state["query_suggestions_fixed"] = query_suggestions
233
  st.session_state["last_doc"] = doc_name
234
  st.session_state["user_query_input"] = ""
 
55
 
56
 
57
  # ==========================================================
58
+ # 🧠 SMART SUGGESTION GENERATOR — bilingual (Hindi + English)
59
  # ==========================================================
60
# Static suggestions used when the model cannot be consulted or yields nothing.
_SUGGESTION_FALLBACK_EN = (
    "How do I start using this guide?",
    "What does this document cover?",
)
_SUGGESTION_FALLBACK_HI_NO_INPUT = (
    "मैं इस गाइड का उपयोग कैसे शुरू करूँ?",
    "यह दस्तावेज़ क्या कवर करता है?",
)
_SUGGESTION_FALLBACK_HI_ON_ERROR = (
    "इस दस्तावेज़ को कैसे शुरू करूँ?",
    "इस दस्तावेज़ का मुख्य उद्देश्य क्या है?",
    "प्रमुख हिस्से कौन से हैं?",
)

# Bullet / ordinal prefixes such as "1.", "2)", "-", "*", "•".
_SUGGESTION_LIST_PREFIX_RE = re.compile(r"^[\-\u2022\*\d\.\)\s]+")
_SUGGESTION_EN_QUESTION_RE = re.compile(
    r"(?i)^(what|how|why|where|who|when|which)\b"
)
# `\b` cannot be used here: Python's `\w` excludes Devanagari vowel signs and
# marks, so no word boundary exists after words such as "क्या" or "कैसे".
# Instead, require that the next character is not part of a word.
_SUGGESTION_HI_QUESTION_RE = re.compile(
    r"^(क्या|क्यों|कैसे|कहाँ|कौन|किस|कब)(?![\w\u0900-\u097F])"
)


def _suggestions_in_hindi(doc_lang):
    """Return True when the language tag starts with "hi" (e.g. "hi", "hi-IN")."""
    return str(doc_lang).lower().startswith("hi")


def _build_suggestion_prompt(doc_name, titles, context_sample, hindi):
    """Assemble the Hindi or English prompt from the TOC titles and sample text."""
    toc_listing = "\n".join("- " + title for title in titles[:8])
    if hindi:
        lines = [
            "",
            "आप एक सामग्री सहायक हैं। नीचे दिए गए तालिका-समाचार (Table of Contents) और दस्तावेज़ के नमूना पाठ के आधार पर 5 से 7 संक्षिप्त, साफ़ और मानवीय प्रश्न बनाइए।",
            'प्रत्येक प्रश्न हिंदी में होना चाहिए, 18 शब्दों से कम, और प्रश्न चिह्न "?" के साथ समाप्त होना चाहिए। प्रश्न केवल दस्तावेज़ से प्रेरित हों — नई जानकारी इजाद न करें।',
            "",
            f'दस्तावेज़: "{doc_name}"',
            "",
            "TABLE OF CONTENTS:",
            toc_listing,
            "",
            "SAMPLE TEXT:",
            context_sample,
            "",
            "आउटपुट: हर प्रश्न को नई लाइन पर लिखें, किसी भी क्रम चिन्ह के साथ (1., -, •) चलेगा। केवल प्रश्न लिखें।",
            "",
        ]
    else:
        lines = [
            "",
            "You are a content assistant. Based on the Table of Contents and the sample document text below, generate 5–7 short, natural user-facing questions.",
            "Each question should be in English, <18 words, and end with a question mark.",
            f'Document: "{doc_name}"',
            "",
            "TABLE OF CONTENTS:",
            toc_listing,
            "",
            "SAMPLE TEXT:",
            context_sample,
            "",
            "Output: Put one question per line. Do not invent facts — base questions on the document.",
            "",
        ]
    return "\n".join(lines)


def _extract_suggested_questions(ai_response):
    """Pull de-duplicated question lines out of the model's raw text reply."""
    questions = []
    seen = set()
    for line in ai_response.splitlines():
        text = _SUGGESTION_LIST_PREFIX_RE.sub("", line.strip()).strip()
        if not text:
            continue
        if not text.endswith("?"):
            # The model sometimes omits "?"; accept short lines that open with
            # an English or Hindi question word and add the mark ourselves.
            opens_like_question = (
                _SUGGESTION_EN_QUESTION_RE.match(text)
                or _SUGGESTION_HI_QUESTION_RE.match(text)
            )
            if len(text.split()) >= 18 or not opens_like_question:
                continue
            text += "?"
        if not 8 <= len(text) <= 140:
            continue
        key = text.casefold()
        if key in seen:
            continue
        seen.add(key)
        questions.append(text)
    return questions


def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_lang="en"):
    """Generate up to seven short suggested questions for a document.

    The prompt is built from cleaned table-of-contents titles plus a sample of
    the first chunks, and is written in Hindi when ``doc_lang`` starts with
    "hi" (otherwise English).

    Args:
        toc: Iterable of ``(section, raw_title)`` pairs; only the title is used.
        chunks: Sequence of text chunks; the first three are joined and
            truncated to 4000 characters as sample context.
        doc_name: Display name of the document, quoted inside the prompt.
        doc_lang: Language tag of the document, e.g. "en", "hi" or "hi-IN".

    Returns:
        A non-empty list of at most seven questions. Sources, in order of
        preference: questions parsed from the model reply, template questions
        built from the TOC titles, and finally static fallback questions
        (also used when ``toc``/``chunks`` is empty or the model call fails).
    """
    import logging  # local import: keeps this block self-contained

    hindi = _suggestions_in_hindi(doc_lang)

    if not toc or not chunks:
        return list(_SUGGESTION_FALLBACK_HI_NO_INPUT if hindi else _SUGGESTION_FALLBACK_EN)

    # Drop the leading section label and any trailing dot-leader/page number.
    # NOTE(review): the first pattern removes any leading run of ASCII
    # letters/digits, so an English title without a numeric label loses its
    # first word — confirm that raw titles always carry a section label.
    titles = []
    for _sec, raw_title in toc:
        title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
        title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
        if 4 < len(title) < 120:
            titles.append(title)

    context_sample = " ".join(chunks[:3])[:4000]
    prompt = _build_suggestion_prompt(doc_name, titles, context_sample, hindi)

    static_fallback = list(
        _SUGGESTION_FALLBACK_HI_ON_ERROR if hindi else _SUGGESTION_FALLBACK_EN
    )

    try:
        # genai_generate is defined elsewhere in this file; presumably it
        # returns the model's reply as a string — verify against its definition.
        questions = _extract_suggested_questions(genai_generate(prompt))
    except Exception as exc:
        # Suggestions are a best-effort UI nicety: never let a model or parsing
        # failure break the page, but leave a trace for debugging.
        logging.getLogger(__name__).warning(
            "Suggestion generation failed; using static fallback: %s", exc
        )
        return static_fallback

    if not questions:
        # Nothing usable from the model: form template questions from titles.
        if hindi:
            questions = [t.rstrip(".") + " के बारे में क्या जानना चाहिए?" for t in titles[:7]]
        else:
            questions = ["What should I know about " + t.rstrip(".") + "?" for t in titles[:7]]

    # No usable titles either: fall back to static questions rather than [].
    return questions[:7] if questions else static_fallback
175
+
176
 
177
  # ==========================================================
178
  # 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
 
309
 
310
  doc_name = os.path.basename(temp_path)
311
  if st.session_state["last_doc"] != doc_name:
312
+ query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name, doc_lang)
313
  st.session_state["query_suggestions_fixed"] = query_suggestions
314
  st.session_state["last_doc"] = doc_name
315
  st.session_state["user_query_input"] = ""