Shubham170793 committed on
Commit
fa923b0
·
verified ·
1 Parent(s): c0ebdcb

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +14 -10
src/streamlit_app.py CHANGED
@@ -36,26 +36,30 @@ from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks,
36
  # 🧠 LANGUAGE DETECTION HELPER (Improved for Hindi PDFs)
37
  # ==========================================================
38
  import re
 
39
 
40
  def detect_language(text_sample: str) -> str:
41
  """
42
- Detect whether the document is primarily Hindi (Devanagari script) or English.
43
- More reliable than langdetect for mixed or scanned PDFs.
 
44
  """
45
  try:
46
- # Count Devanagari and Latin characters
47
- devanagari_chars = len(re.findall(r'[\u0900-\u097F]', text_sample))
48
- latin_chars = len(re.findall(r'[A-Za-z]', text_sample))
49
-
50
- # Decide dominant language
51
- if devanagari_chars > latin_chars * 2:
52
  return "hi"
53
- else:
54
- return "en"
 
 
 
 
 
55
  except Exception:
56
  return "en"
57
 
58
 
 
59
  # ==========================================================
60
  # 🧠 SMART SUGGESTION GENERATOR
61
  # ==========================================================
 
36
  # 🧠 LANGUAGE DETECTION HELPER (Improved for Hindi PDFs)
37
  # ==========================================================
38
  import re
39
+ from langdetect import detect # keep as fallback
40
 
41
  def detect_language(text_sample: str) -> str:
42
  """
43
+ Quick robust detection:
44
+ - If Devanagari chars are present → Hindi (hi)
45
+ - Else fallback to langdetect (which needs real text to be accurate)
46
  """
47
  try:
48
+ # Fast deterministic check for Devanagari (Hindi) chars
49
+ if re.search(r"[\u0900-\u097F]", text_sample):
 
 
 
 
50
  return "hi"
51
+
52
+ # Other Indic scripts can be handled the same way by adding more ranges,
53
+ # e.g. Bengali \u0980-\u09FF ; Tamil \u0B80-\u0BFF etc.
54
+
55
+ # Fallback to langdetect for everything else
56
+ lang = detect(text_sample)
57
+ return "hi" if lang.startswith("hi") else "en"
58
  except Exception:
59
  return "en"
60
 
61
 
62
+
63
  # ==========================================================
64
  # 🧠 SMART SUGGESTION GENERATOR
65
  # ==========================================================