Update src/streamlit_app.py
Browse files- src/streamlit_app.py +16 -6
src/streamlit_app.py
CHANGED
|
@@ -5,7 +5,7 @@ import os
|
|
| 5 |
import re
|
| 6 |
import streamlit as st
|
| 7 |
import torch
|
| 8 |
-
|
| 9 |
|
| 10 |
# ==========================================================
|
| 11 |
# ✅ PAGE CONFIGS
|
|
@@ -33,19 +33,29 @@ from vectorstore import build_faiss_index
|
|
| 33 |
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
|
| 34 |
|
| 35 |
# ==========================================================
|
| 36 |
-
# 🧠 LANGUAGE DETECTION HELPER
|
| 37 |
# ==========================================================
|
|
|
|
|
|
|
| 38 |
def detect_language(text_sample: str) -> str:
|
| 39 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 40 |
try:
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
return "hi"
|
| 44 |
else:
|
| 45 |
return "en"
|
| 46 |
-
except:
|
| 47 |
return "en"
|
| 48 |
|
|
|
|
| 49 |
# ==========================================================
|
| 50 |
# 🧠 SMART SUGGESTION GENERATOR
|
| 51 |
# ==========================================================
|
|
|
|
| 5 |
import re
|
| 6 |
import streamlit as st
|
| 7 |
import torch
|
| 8 |
+
|
| 9 |
|
| 10 |
# ==========================================================
|
| 11 |
# ✅ PAGE CONFIGS
|
|
|
|
| 33 |
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
|
| 34 |
|
| 35 |
# ==========================================================
|
| 36 |
+
# 🧠 LANGUAGE DETECTION HELPER (Improved for Hindi PDFs)
|
| 37 |
# ==========================================================
|
| 38 |
+
import re
|
| 39 |
+
|
| 40 |
def detect_language(text_sample: str) -> str:
    """
    Detect whether the document is primarily Hindi (Devanagari script) or English.

    More reliable than langdetect for mixed or scanned PDFs.

    The decision is a plain character count: code points in the Devanagari
    block (U+0900-U+097F) are compared against ASCII Latin letters (A-Z, a-z).
    Digits, whitespace, punctuation outside those ranges and every other
    script are ignored.

    Args:
        text_sample: Text to classify.

    Returns:
        "hi" when the Devanagari count is strictly greater than twice the
        Latin count, otherwise "en". Empty text and any non-string input
        (e.g. None or bytes) also yield "en".
    """
    # Best-effort fallback: non-text input is classified as English instead of
    # raising. This explicit guard replaces a blanket `except Exception`, so a
    # genuine programming error in this function can no longer be hidden.
    if not isinstance(text_sample, str):
        return "en"

    # Count Devanagari and Latin characters
    devanagari_chars = len(re.findall(r'[\u0900-\u097F]', text_sample))
    latin_chars = len(re.findall(r'[A-Za-z]', text_sample))

    # Decide dominant language: Hindi must outweigh Latin by more than 2:1,
    # so text with no letters of either kind (0 > 0 is False) stays "en".
    if devanagari_chars > latin_chars * 2:
        return "hi"
    return "en"
|
| 57 |
|
| 58 |
+
|
| 59 |
# ==========================================================
|
| 60 |
# 🧠 SMART SUGGESTION GENERATOR
|
| 61 |
# ==========================================================
|