Shubham170793 committed on
Commit
418ad1d
·
verified ·
1 Parent(s): 954cf7f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +7 -20
src/streamlit_app.py CHANGED
@@ -6,14 +6,6 @@ import re
6
  import streamlit as st
7
  import torch
8
 
9
- # ==========================================================
10
- # 🌐 Language Detection Imports
11
- # ==========================================================
12
- from fasttext_langdetect import detect_langs
13
- from langdetect import detect
14
-
15
-
16
-
17
  # ==========================================================
18
  # ✅ PAGE CONFIGS
19
  # ==========================================================
@@ -40,26 +32,21 @@ from vectorstore import build_faiss_index
40
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
41
 
42
  # ==========================================================
43
- # 🧠 LANGUAGE DETECTION HELPER (Improved for Hindi PDFs)
44
  # ==========================================================
45
- import re
46
- from langdetect import detect # keep as fallback
47
 
48
  def detect_language(text_sample: str) -> str:
49
  """
50
- Quick robust detection:
51
- - If Devanagari chars present → Hindi (hi)
52
- - Else fallback to langdetect (which needs real text to be accurate)
53
  """
54
  try:
55
- # Fast deterministic check for Devanagari (Hindi) chars
56
  if re.search(r"[\u0900-\u097F]", text_sample):
57
  return "hi"
58
 
59
- # Some other Indic scripts? you can add more ranges similarly
60
- # e.g. Bengali \u0980-\u09FF ; Tamil \u0B80-\u0BFF etc.
61
-
62
- # Fallback to langdetect for everything else
63
  lang = detect(text_sample)
64
  return "hi" if lang.startswith("hi") else "en"
65
  except Exception:
@@ -230,7 +217,7 @@ else:
230
 
231
  # 🌐 Detect document language (robust multilingual)
232
  doc_sample = " ".join(chunks[:3])[:3000]
233
- doc_lang = detect_document_language(doc_sample)
234
  st.session_state["doc_lang"] = doc_lang
235
  lang_label = "Hindi" if doc_lang.startswith("hi") else "English"
236
  st.caption(f"🈹 Detected document language: {lang_label}")
 
6
  import streamlit as st
7
  import torch
8
 
 
 
 
 
 
 
 
 
9
  # ==========================================================
10
  # ✅ PAGE CONFIGS
11
  # ==========================================================
 
32
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
33
 
34
  # ==========================================================
35
+ # 🧠 LANGUAGE DETECTION HELPER (Devanagari check first, langdetect fallback)
36
  # ==========================================================
37
+ from langdetect import detect
 
38
 
39
  def detect_language(text_sample: str) -> str:
40
  """
41
+ Detects Hindi (Devanagari) or English.
42
+ Returns "hi" for Hindi and "en" for English.
 
43
  """
44
  try:
45
+ # Quick Unicode-based detection for Hindi
46
  if re.search(r"[\u0900-\u097F]", text_sample):
47
  return "hi"
48
 
49
+ # Fallback to langdetect
 
 
 
50
  lang = detect(text_sample)
51
  return "hi" if lang.startswith("hi") else "en"
52
  except Exception:
 
217
 
218
  # 🌐 Detect document language (robust multilingual)
219
  doc_sample = " ".join(chunks[:3])[:3000]
220
+ doc_lang = detect_language(doc_sample)
221
  st.session_state["doc_lang"] = doc_lang
222
  lang_label = "Hindi" if doc_lang.startswith("hi") else "English"
223
  st.caption(f"🈹 Detected document language: {lang_label}")