Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

Shubham170793 committed on Oct 20

Commit

69b92ed

verified ·

1 Parent(s): b2596ee

Update src/streamlit_app.py

Files changed (1) hide show

src/streamlit_app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import os
 import re
 import streamlit as st
 import torch
-from langdetect import detect  # 🆕 Added for language detection
 # ==========================================================
 # ✅ PAGE CONFIGS
@@ -33,19 +33,29 @@ from vectorstore import build_faiss_index
 from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
 # ==========================================================
-# 🧠 LANGUAGE DETECTION HELPER
 # ==========================================================
 def detect_language(text_sample: str) -> str:
-    """Detects whether text is Hindi (hi) or English (en)."""
     try:
-        lang = detect(text_sample)
-        if lang.startswith("hi"):
             return "hi"
         else:
             return "en"
-    except:
         return "en"
 # ==========================================================
 # 🧠 SMART SUGGESTION GENERATOR
 # ==========================================================

 import re
 import streamlit as st
 import torch
 # ==========================================================
 # ✅ PAGE CONFIGS
 from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
 # ==========================================================
+# 🧠 LANGUAGE DETECTION HELPER (Improved for Hindi PDFs)
 # ==========================================================
+import re
 def detect_language(text_sample: str) -> str:
+    """
+    Detect whether the document is primarily Hindi (Devanagari script) or English.
+    More reliable than langdetect for mixed or scanned PDFs.
+    """
     try:
+        # Count Devanagari and Latin characters
+        devanagari_chars = len(re.findall(r'[\u0900-\u097F]', text_sample))
+        latin_chars = len(re.findall(r'[A-Za-z]', text_sample))
+        # Decide dominant language
+        if devanagari_chars > latin_chars * 2:
             return "hi"
         else:
             return "en"
+    except Exception:
         return "en"
 # ==========================================================
 # 🧠 SMART SUGGESTION GENERATOR
 # ==========================================================