Update src/streamlit_app.py
Browse files- src/streamlit_app.py +26 -123
src/streamlit_app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# ==========================================================
|
| 2 |
-
# streamlit_app.py — Stable Layout
|
| 3 |
# ==========================================================
|
| 4 |
import os
|
| 5 |
import re
|
|
@@ -32,44 +32,15 @@ from vectorstore import build_faiss_index
|
|
| 32 |
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
|
| 33 |
|
| 34 |
# ==========================================================
|
| 35 |
-
# 🧠
|
| 36 |
# ==========================================================
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def detect_language(text_sample: str) -> str:
|
| 40 |
"""
|
| 41 |
-
|
| 42 |
-
Returns "hi" for Hindi and "en" for English.
|
| 43 |
-
"""
|
| 44 |
-
try:
|
| 45 |
-
# Quick Unicode-based detection for Hindi
|
| 46 |
-
if re.search(r"[\u0900-\u097F]", text_sample):
|
| 47 |
-
return "hi"
|
| 48 |
-
|
| 49 |
-
# Fallback to langdetect
|
| 50 |
-
lang = detect(text_sample)
|
| 51 |
-
return "hi" if lang.startswith("hi") else "en"
|
| 52 |
-
except Exception:
|
| 53 |
-
return "en"
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
# ==========================================================
|
| 58 |
-
# 🧠 SMART SUGGESTION GENERATOR — bilingual (Hindi + English)
|
| 59 |
-
# ==========================================================
|
| 60 |
-
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_lang="en"):
|
| 61 |
-
"""
|
| 62 |
-
Generates 5-7 short, natural questions from TOC + a sample of chunks.
|
| 63 |
-
If doc_lang == "hi", the prompt asks the model to return questions in Hindi.
|
| 64 |
"""
|
| 65 |
if not toc or not chunks:
|
| 66 |
-
|
| 67 |
-
return ["How do I start using this guide?", "What does this document cover?"] if doc_lang != "hi" else [
|
| 68 |
-
"मैं इस गाइड का उपयोग कैसे शुरू करूँ?",
|
| 69 |
-
"यह दस्तावेज़ क्या कवर करता है?"
|
| 70 |
-
]
|
| 71 |
|
| 72 |
-
# Build candidate titles from TOC
|
| 73 |
titles = []
|
| 74 |
for sec, raw_title in toc:
|
| 75 |
title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
|
|
@@ -78,27 +49,10 @@ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_
|
|
| 78 |
titles.append(title)
|
| 79 |
|
| 80 |
context_sample = " ".join(chunks[:3])[:4000]
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
आप एक सामग्री सहायक हैं। नीचे दिए गए तालिका-समाचार (Table of Contents) और दस्तावेज़ के नमूना पाठ के आधार पर 5 से 7 संक्षिप्त, साफ़ और मानवीय प्रश्न बनाइए।
|
| 86 |
-
प्रत्येक प्रश्न हिंदी में होना चाहिए, 18 शब्दों से कम, और प्रश्न चिह्न "?" के साथ समाप्त होना चाहिए। प्रश्न केवल दस्तावेज़ से प्रेरित हों — नई जानकारी इजाद न करें।
|
| 87 |
-
|
| 88 |
-
दस्तावेज़: "{doc_name}"
|
| 89 |
-
|
| 90 |
-
TABLE OF CONTENTS:
|
| 91 |
-
{chr(10).join(['- ' + t for t in titles[:8]])}
|
| 92 |
-
|
| 93 |
-
SAMPLE TEXT:
|
| 94 |
-
{context_sample}
|
| 95 |
-
|
| 96 |
-
आउटपुट: हर प्रश्न को नई लाइन पर लिखें, किसी भी क्रम चिन्ह के साथ (1., -, •) चलेगा। केवल प्रश्न लिखें।
|
| 97 |
-
"""
|
| 98 |
-
else:
|
| 99 |
-
prompt = f"""
|
| 100 |
-
You are a content assistant. Based on the Table of Contents and the sample document text below, generate 5–7 short, natural user-facing questions.
|
| 101 |
-
Each question should be in English, <18 words, and end with a question mark.
|
| 102 |
Document: "{doc_name}"
|
| 103 |
|
| 104 |
TABLE OF CONTENTS:
|
|
@@ -107,72 +61,31 @@ TABLE OF CONTENTS:
|
|
| 107 |
SAMPLE TEXT:
|
| 108 |
{context_sample}
|
| 109 |
|
| 110 |
-
Output:
|
| 111 |
"""
|
| 112 |
|
| 113 |
try:
|
| 114 |
ai_response = genai_generate(prompt)
|
| 115 |
-
|
| 116 |
-
# Normalize response to lines
|
| 117 |
lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
|
| 118 |
-
|
| 119 |
-
# Heuristics to extract candidate questions
|
| 120 |
-
candidates = []
|
| 121 |
for ln in lines:
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
# if line already ends with a question mark, keep it
|
| 126 |
-
if ln_clean.endswith("?"):
|
| 127 |
-
q = ln_clean
|
| 128 |
-
else:
|
| 129 |
-
# sometimes model returns without "?" but as a question — add "?" if short and starts with W/H or Hindi question words
|
| 130 |
-
if (len(ln_clean.split()) < 18) and re.match(r"(?i)^(what|how|why|where|who|when|which)\b", ln_clean):
|
| 131 |
-
q = ln_clean + "?"
|
| 132 |
-
# Hindi question words heuristic
|
| 133 |
-
elif re.match(r"^(क्या|क्यों|कैसे|कहाँ|कौन|किस|कब)\b", ln_clean):
|
| 134 |
-
q = ln_clean if ln_clean.endswith("?") else ln_clean + "?"
|
| 135 |
-
else:
|
| 136 |
-
# skip lines that don't look like questions
|
| 137 |
-
continue
|
| 138 |
-
|
| 139 |
-
# length/filter
|
| 140 |
-
q = q.strip()
|
| 141 |
if 8 <= len(q) <= 140:
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
# dedupe while preserving order
|
| 145 |
-
seen = set()
|
| 146 |
final = []
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
if
|
| 150 |
-
seen.add(
|
| 151 |
final.append(q)
|
| 152 |
-
|
| 153 |
-
# If we ended up with none, fallback to naive generation from titles
|
| 154 |
if not final:
|
| 155 |
-
|
| 156 |
-
for t in titles[:7]:
|
| 157 |
-
if str(doc_lang).startswith("hi"):
|
| 158 |
-
cand = t.rstrip(".") + " के बारे में क्या जानना चाहिए?"
|
| 159 |
-
else:
|
| 160 |
-
cand = "What should I know about " + t.rstrip(".") + "?"
|
| 161 |
-
final.append(cand)
|
| 162 |
-
# limit to 7
|
| 163 |
return final[:7]
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
# graceful bilingual fallback
|
| 167 |
-
if str(doc_lang).startswith("hi"):
|
| 168 |
-
return [
|
| 169 |
-
"इस दस्तावेज़ को कैसे शुरू करूँ?",
|
| 170 |
-
"इस दस्तावेज़ का मुख्य उद्देश्य क्या है?",
|
| 171 |
-
"प्रमुख हिस्से कौन से हैं?"
|
| 172 |
-
]
|
| 173 |
-
else:
|
| 174 |
-
return ["How do I start using this guide?", "What does this document cover?"]
|
| 175 |
-
|
| 176 |
|
| 177 |
# ==========================================================
|
| 178 |
# 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
|
|
@@ -254,7 +167,6 @@ for key, val in {
|
|
| 254 |
"selected_suggestion": None,
|
| 255 |
"query_suggestions_fixed": None,
|
| 256 |
"last_doc": None,
|
| 257 |
-
"doc_lang": "en", # 🆕 store document language
|
| 258 |
}.items():
|
| 259 |
if key not in st.session_state:
|
| 260 |
st.session_state[key] = val
|
|
@@ -262,7 +174,7 @@ for key, val in {
|
|
| 262 |
def set_user_query(q, idx):
|
| 263 |
st.session_state["user_query_input"] = q
|
| 264 |
st.session_state["selected_suggestion"] = idx
|
| 265 |
-
st.
|
| 266 |
|
| 267 |
# ==========================================================
|
| 268 |
# 📄 MAIN SECTION
|
|
@@ -296,20 +208,13 @@ else:
|
|
| 296 |
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 297 |
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 298 |
|
| 299 |
-
# 🌐 Detect document language (robust multilingual)
|
| 300 |
-
doc_sample = " ".join(chunks[:3])[:3000]
|
| 301 |
-
doc_lang = detect_language(doc_sample)
|
| 302 |
-
st.session_state["doc_lang"] = doc_lang
|
| 303 |
-
lang_label = "Hindi" if doc_lang.startswith("hi") else "English"
|
| 304 |
-
st.caption(f"🈹 Detected document language: {lang_label}")
|
| 305 |
-
|
| 306 |
with st.spinner("⚙️ Building search index..."):
|
| 307 |
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 308 |
index = build_faiss_index(embeddings)
|
| 309 |
|
| 310 |
doc_name = os.path.basename(temp_path)
|
| 311 |
if st.session_state["last_doc"] != doc_name:
|
| 312 |
-
query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name
|
| 313 |
st.session_state["query_suggestions_fixed"] = query_suggestions
|
| 314 |
st.session_state["last_doc"] = doc_name
|
| 315 |
st.session_state["user_query_input"] = ""
|
|
@@ -345,9 +250,7 @@ else:
|
|
| 345 |
reasoning_mode = mode == "Extended (Document + general)"
|
| 346 |
with st.spinner("💭 Generating your answer..."):
|
| 347 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 348 |
-
|
| 349 |
-
print("🧠 Document language used for GPT prompt:", doc_lang)
|
| 350 |
-
answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode, doc_lang=doc_lang)
|
| 351 |
|
| 352 |
st.markdown("### 🤖 Assistant’s Answer")
|
| 353 |
|
|
|
|
| 1 |
# ==========================================================
|
| 2 |
+
# streamlit_app.py — Stable Layout (English Only)
|
| 3 |
# ==========================================================
|
| 4 |
import os
|
| 5 |
import re
|
|
|
|
| 32 |
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
|
| 33 |
|
| 34 |
# ==========================================================
|
| 35 |
+
# 🧠 SMART SUGGESTION GENERATOR (English Only)
|
| 36 |
# ==========================================================
|
| 37 |
+
def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
    """
    Generate 5–7 short, natural English questions from a document's TOC and text.

    Parameters
    ----------
    toc : list[tuple]
        (section, raw_title) pairs as produced by the PDF extractor.
    chunks : list[str]
        Text chunks of the document; the first three are sampled for context.
    doc_name : str
        Display name of the document, embedded in the LLM prompt.

    Returns
    -------
    list[str]
        At most 7 deduplicated question strings. Falls back to two generic
        questions when `toc`/`chunks` are empty or the LLM call fails.
    """
    if not toc or not chunks:
        return ["How do I start using this guide?", "What does this document cover?"]

    titles = []
    for sec, raw_title in toc:
        # Strip leading numbering such as "3.1 " or "A. " from the raw title.
        title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
        # NOTE(review): the diff elided one or two lines here (likely a
        # title-quality/length filter); restored as a minimal non-empty
        # guard — confirm against the original file.
        if title:
            titles.append(title)

    # Cap the prompt context at 4000 chars from the first three chunks.
    context_sample = " ".join(chunks[:3])[:4000]
    prompt = f"""
You are a content assistant. Based on the Table of Contents and the sample document text below,
generate 5–7 short, natural user-facing questions.
Each question should be under 18 words, end with a question mark, and sound human.
Document: "{doc_name}"

TABLE OF CONTENTS:
{chr(10).join(['- ' + t for t in titles[:8]])}

SAMPLE TEXT:
{context_sample}

Output: Write each question on a new line. Do not invent facts — base questions only on the document.
"""
    try:
        ai_response = genai_generate(prompt)
        lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
        questions = []
        for ln in lines:
            # Drop list markers (-, •, *, "1.", "2)") the model may prepend.
            q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
            # Add a trailing "?" to short lines that clearly start like a question.
            if not q.endswith("?") and len(q.split()) < 18 and re.match(
                r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q
            ):
                q += "?"
            # Keep plausibly-sized candidates only (too short = noise, too long = paragraph).
            if 8 <= len(q) <= 140:
                questions.append(q)

        # Deduplicate case-insensitively while preserving first-seen order.
        final = []
        seen = set()
        for q in questions:
            if q.lower() not in seen:
                seen.add(q.lower())
                final.append(q)

        # If the model produced nothing usable, derive naive questions from titles.
        if not final:
            final = [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
        return final[:7]
    except Exception:
        # Graceful fallback on any model/parse failure.
        return ["How do I start using this guide?", "What does this document cover?"]
| 89 |
|
| 90 |
# ==========================================================
|
| 91 |
# 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
|
|
|
|
| 167 |
"selected_suggestion": None,
|
| 168 |
"query_suggestions_fixed": None,
|
| 169 |
"last_doc": None,
|
|
|
|
| 170 |
}.items():
|
| 171 |
if key not in st.session_state:
|
| 172 |
st.session_state[key] = val
|
|
|
|
| 174 |
def set_user_query(q, idx):
    """Store a clicked suggestion as the active query and remember its index."""
    st.session_state["user_query_input"] = q
    st.session_state["selected_suggestion"] = idx
    # NOTE(review): if this is used as a widget on_click callback, Streamlit
    # already reruns after the callback and treats st.rerun() inside one as a
    # no-op — confirm this call is actually needed at the call sites.
    st.rerun()
| 178 |
|
| 179 |
# ==========================================================
|
| 180 |
# 📄 MAIN SECTION
|
|
|
|
| 208 |
text, toc, toc_source = extract_text_from_pdf(temp_path)
|
| 209 |
chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
with st.spinner("⚙️ Building search index..."):
|
| 212 |
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 213 |
index = build_faiss_index(embeddings)
|
| 214 |
|
| 215 |
doc_name = os.path.basename(temp_path)
|
| 216 |
if st.session_state["last_doc"] != doc_name:
|
| 217 |
+
query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
|
| 218 |
st.session_state["query_suggestions_fixed"] = query_suggestions
|
| 219 |
st.session_state["last_doc"] = doc_name
|
| 220 |
st.session_state["user_query_input"] = ""
|
|
|
|
| 250 |
reasoning_mode = mode == "Extended (Document + general)"
|
| 251 |
with st.spinner("💭 Generating your answer..."):
|
| 252 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 253 |
+
answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
|
|
|
|
|
|
|
| 254 |
|
| 255 |
st.markdown("### 🤖 Assistant’s Answer")
|
| 256 |
|