Shubham170793 committed on
Commit
69b92ed
·
verified ·
1 Parent(s): b2596ee

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +16 -6
src/streamlit_app.py CHANGED
@@ -5,7 +5,7 @@ import os
5
  import re
6
  import streamlit as st
7
  import torch
8
- from langdetect import detect # 🆕 Added for language detection
9
 
10
  # ==========================================================
11
  # ✅ PAGE CONFIGS
@@ -33,19 +33,29 @@ from vectorstore import build_faiss_index
33
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
34
 
35
  # ==========================================================
36
- # 🧠 LANGUAGE DETECTION HELPER
37
  # ==========================================================
 
 
38
  def detect_language(text_sample: str) -> str:
39
- """Detects whether text is Hindi (hi) or English (en)."""
 
 
 
40
  try:
41
- lang = detect(text_sample)
42
- if lang.startswith("hi"):
 
 
 
 
43
  return "hi"
44
  else:
45
  return "en"
46
- except:
47
  return "en"
48
 
 
49
  # ==========================================================
50
  # 🧠 SMART SUGGESTION GENERATOR
51
  # ==========================================================
 
5
  import re
6
  import streamlit as st
7
  import torch
8
+
9
 
10
  # ==========================================================
11
  # ✅ PAGE CONFIGS
 
33
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
34
 
35
  # ==========================================================
36
+ # 🧠 LANGUAGE DETECTION HELPER (Improved for Hindi PDFs)
37
  # ==========================================================
38
+ import re
39
+
40
  def detect_language(text_sample: str) -> str:
41
+ """
42
+ Detect whether the document is primarily Hindi (Devanagari script) or English.
43
+ More reliable than langdetect for mixed or scanned PDFs.
44
+ """
45
  try:
46
+ # Count Devanagari and Latin characters
47
+ devanagari_chars = len(re.findall(r'[\u0900-\u097F]', text_sample))
48
+ latin_chars = len(re.findall(r'[A-Za-z]', text_sample))
49
+
50
+ # Decide dominant language
51
+ if devanagari_chars > latin_chars * 2:
52
  return "hi"
53
  else:
54
  return "en"
55
+ except Exception:
56
  return "en"
57
 
58
+
59
  # ==========================================================
60
  # 🧠 SMART SUGGESTION GENERATOR
61
  # ==========================================================