Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 21

Commit

1ffa2bc

verified ·

1 Parent(s): 418ad1d

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +103 -22

src/streamlit_app.py CHANGED Viewed

@@ -55,43 +55,124 @@ def detect_language(text_sample: str) -> str:
 # ==========================================================
-# 🧠 SMART SUGGESTION GENERATOR
 # ==========================================================
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
    """Build up to seven suggested questions from a document's TOC and text.

    Args:
        toc: Iterable of ``(section, raw_title)`` pairs.
        chunks: List of text chunks; the first three are joined and
            truncated to 4000 characters as prompt context.
        doc_name: Display name interpolated into the prompt.

    Returns:
        A list of at most seven unique question strings. Returns ``[]`` when
        ``toc`` or ``chunks`` is empty, and two generic questions when the
        generation backend fails or returns a non-string.
    """
    import logging

    if not toc or not chunks:
        return []

    titles = []
    for _sec, raw_title in toc:
        # NOTE(review): the first pattern removes the leading alphanumeric
        # token, which presumably is the section number -- confirm that
        # raw_title always starts with one, otherwise the first word is lost.
        title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
        # Drop dot leaders followed by a trailing page number ("..... 12").
        title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
        if 4 < len(title) < 120:
            titles.append(title)

    context_sample = " ".join(chunks[:3])[:4000]
    toc_block = "\n".join("- " + t for t in titles[:8])
    prompt = f"""
    You are generating short, natural, and context-aware questions for users reading "{doc_name}".
    Use the Table of Contents and some document text for inspiration.
    TABLE OF CONTENTS:
    {toc_block}
    SAMPLE TEXT:
    {context_sample}
    Generate 5–7 clear and human-like questions based strictly on this document.
    Each should sound natural, under 18 words, and avoid robotic phrasing.
    """

    fallback = ["How do I start using this guide?", "What does this document cover?"]

    # Only the backend call can realistically fail; suggestions are a
    # best-effort UI nicety, so degrade to generic questions instead of raising.
    try:
        ai_response = genai_generate(prompt)
    except Exception:
        logging.getLogger(__name__).warning(
            "Suggestion generation failed; using generic suggestions.",
            exc_info=True,
        )
        return fallback
    if not isinstance(ai_response, str):
        return fallback

    final = []
    seen = set()
    for raw in re.findall(r"[-•]?\s*(.+?)\?", ai_response):
        if not 8 < len(raw) < 120:
            continue
        # The capture still carries list markers such as "1." or "2)";
        # remove them so the suggestion reads as a plain question.
        body = re.sub(r"^\s*(?:(?:\d+[.)]\s+|[-•—*]\s*))+", "", raw)
        question = body.strip("•-— ").strip() + "?"
        key = question.lower()
        if key not in seen:
            seen.add(key)
            final.append(question)
    return final[:7]
 # ==========================================================
 # 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
@@ -228,7 +309,7 @@ else:
         doc_name = os.path.basename(temp_path)
         if st.session_state["last_doc"] != doc_name:
-            query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
             st.session_state["query_suggestions_fixed"] = query_suggestions
             st.session_state["last_doc"] = doc_name
             st.session_state["user_query_input"] = ""

 # ==========================================================
+# 🧠 SMART SUGGESTION GENERATOR — bilingual (Hindi + English)
 # ==========================================================
# Leading list markers: "1. ", "2) ", "-", "•", "*" (possibly repeated).
# A number only counts as a marker when followed by "." or ")" and a space,
# so questions that start with real digits ("3D ...", "1.5 GB ...") survive.
_LIST_PREFIX_RE = re.compile(r"^\s*(?:\d+[.)]\s+|[-\u2022*]\s*)+")
_EN_QUESTION_RE = re.compile(r"(?i)^(what|how|why|where|who|when|which)\b")
# "\b" is unusable here: Python's "\w" excludes Devanagari vowel signs, so a
# word ending in a matra (e.g. "क्या") followed by a space has no word
# boundary. Instead, require that no Devanagari character follows the word.
_HI_QUESTION_RE = re.compile(
    r"^(क्या|क्यों|कैसे|कहाँ|कौन|किस|कब)(?![\u0900-\u097F])"
)


def _extract_questions(ai_response):
    """Return unique question lines found in the model response, in order."""
    questions = []
    seen = set()
    for line in ai_response.splitlines():
        text = _LIST_PREFIX_RE.sub("", line.strip()).strip()
        if not text:
            continue
        if not text.endswith("?"):
            # Accept a line without "?" only if it is short and opens with an
            # English or Hindi question word; anything else is commentary.
            looks_like_question = len(text.split()) < 18 and (
                _EN_QUESTION_RE.match(text) or _HI_QUESTION_RE.match(text)
            )
            if not looks_like_question:
                continue
            text += "?"
        if not 8 <= len(text) <= 140:
            continue
        key = text.casefold()
        if key not in seen:
            seen.add(key)
            questions.append(text)
    return questions


def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_lang="en"):
    """Generate 5-7 short, natural questions from the TOC and sample chunks.

    Args:
        toc: Iterable of ``(section, raw_title)`` pairs.
        chunks: List of text chunks; the first three are joined and
            truncated to 4000 characters as prompt context.
        doc_name: Display name interpolated into the prompt.
        doc_lang: Language code. Any value whose string form starts with
            ``"hi"`` selects the Hindi prompt and Hindi fallbacks; everything
            else is treated as English.

    Returns:
        A non-empty list of at most seven question strings. Generic
        suggestions are returned when the inputs are empty, when the
        generation backend fails, or when neither the model output nor the
        TOC titles yield a usable question.
    """
    import logging

    # Decide the language once so every branch agrees (the empty-input guard
    # previously compared against the exact string "hi" only).
    is_hindi = str(doc_lang).startswith("hi")

    if not toc or not chunks:
        if is_hindi:
            return [
                "मैं इस गाइड का उपयोग कैसे शुरू करूँ?",
                "यह दस्तावेज़ क्या कवर करता है?",
            ]
        return ["How do I start using this guide?", "What does this document cover?"]

    if is_hindi:
        generic = [
            "इस दस्तावेज़ को कैसे शुरू करूँ?",
            "इस दस्तावेज़ का मुख्य उद्देश्य क्या है?",
            "प्रमुख हिस्से कौन से हैं?",
        ]
    else:
        generic = ["How do I start using this guide?", "What does this document cover?"]

    # Build candidate titles from the TOC.
    titles = []
    for _sec, raw_title in toc:
        # NOTE(review): the first pattern removes the leading alphanumeric
        # token, which presumably is the section number -- confirm that
        # raw_title always starts with one, otherwise the first word is lost.
        title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
        # Drop dot leaders followed by a trailing page number ("..... 12").
        title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
        if 4 < len(title) < 120:
            titles.append(title)

    context_sample = " ".join(chunks[:3])[:4000]
    toc_block = "\n".join("- " + t for t in titles[:8])

    # Choose the language-aware prompt.
    if is_hindi:
        prompt = f"""
आप एक सामग्री सहायक हैं। नीचे दिए गए तालिका-समाचार (Table of Contents) और दस्तावेज़ के नमूना पाठ के आधार पर 5 से 7 संक्षिप्त, साफ़ और मानवीय प्रश्न बनाइए।
प्रत्येक प्रश्न हिंदी में होना चाहिए, 18 शब्दों से कम, और प्रश्न चिह्न "?" के साथ समाप्त होना चाहिए। प्रश्न केवल दस्तावेज़ से प्रेरित हों — नई जानकारी इजाद न करें।
दस्तावेज़: "{doc_name}"
TABLE OF CONTENTS:
{toc_block}
SAMPLE TEXT:
{context_sample}
आउटपुट: हर प्रश्न को नई लाइन पर लिखें, किसी भी क्रम चिन्ह के साथ (1., -, •) चलेगा। केवल प्रश्न लिखें।
"""
    else:
        prompt = f"""
You are a content assistant. Based on the Table of Contents and the sample document text below, generate 5–7 short, natural user-facing questions.
Each question should be in English, <18 words, and end with a question mark.
Document: "{doc_name}"
TABLE OF CONTENTS:
{toc_block}
SAMPLE TEXT:
{context_sample}
Output: Put one question per line. Do not invent facts — base questions on the document.
"""

    # Only the backend call can realistically fail; suggestions are a
    # best-effort UI nicety, so degrade to generic questions instead of raising.
    try:
        ai_response = genai_generate(prompt)
    except Exception:
        logging.getLogger(__name__).warning(
            "Suggestion generation failed; using generic suggestions.",
            exc_info=True,
        )
        return generic
    if not isinstance(ai_response, str):
        return generic

    questions = _extract_questions(ai_response)

    # Nothing usable from the model: derive simple questions from the titles.
    if not questions:
        for title in titles[:7]:
            topic = title.rstrip(".")
            if is_hindi:
                questions.append(topic + " के बारे में क्या जानना चाहिए?")
            else:
                questions.append("What should I know about " + topic + "?")

    # Never hand the UI an empty suggestion list.
    return questions[:7] or generic
 # ==========================================================
 # 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
         doc_name = os.path.basename(temp_path)
         if st.session_state["last_doc"] != doc_name:
+            query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name, doc_lang)
             st.session_state["query_suggestions_fixed"] = query_suggestions
             st.session_state["last_doc"] = doc_name
             st.session_state["user_query_input"] = ""