# ==========================================================
# streamlit_app.py — Stable Layout + Multilingual Enhancement (Hindi + English)
# ==========================================================
import os
import re
import streamlit as st
import torch
# ==========================================================
# ✅ PAGE CONFIGS
# ==========================================================
# Must be the first Streamlit command executed in the script.
st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
# Startup GPU check — goes to the server log, not the UI.
print("CUDA available:", torch.cuda.is_available())
# ==========================================================
# ⚙️ CACHE SETUP
# ==========================================================
# Point every Hugging Face cache at a writable tmp directory (needed on
# hosts such as Streamlit Cloud / HF Spaces where $HOME may be read-only).
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ.update({
    "HF_HOME": CACHE_DIR,
    "TRANSFORMERS_CACHE": CACHE_DIR,  # legacy alias kept for older transformers
    "HF_DATASETS_CACHE": CACHE_DIR,
    "HF_MODULES_CACHE": CACHE_DIR,
})
# ==========================================================
# 📦 IMPORTS
# ==========================================================
# NOTE(review): project imports are deliberately placed AFTER the cache env
# vars above — presumably they trigger model/cache initialisation on import.
from ingestion import extract_text_from_pdf, chunk_text
from vectorstore import build_faiss_index
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
# ==========================================================
# 🧠 LANGUAGE DETECTION HELPER (Fast, No Dependencies)
# ==========================================================
from langdetect import detect
def detect_language(text_sample: str) -> str:
    """Classify a text sample as Hindi ("hi") or English ("en").

    A direct Devanagari code-point scan catches Hindi cheaply; anything
    else is handed to langdetect. Any failure (empty input, langdetect
    error) falls back to English.
    """
    devanagari = re.compile(r"[\u0900-\u097F]")
    try:
        if devanagari.search(text_sample):
            return "hi"
        guessed = detect(text_sample)
    except Exception:
        # Empty/odd input or a langdetect failure — default to English.
        return "en"
    return "hi" if guessed.startswith("hi") else "en"
# ==========================================================
# 🧠 SMART SUGGESTION GENERATOR — bilingual (Hindi + English)
# ==========================================================
def _toc_titles(toc):
    """Pull clean, human-readable titles out of (section, raw_title) TOC pairs."""
    titles = []
    for _sec, raw_title in toc:
        # Strip numbering prefixes such as "1", "1.2.3", "A." or "B)" only when
        # they are followed by whitespace.  (The previous pattern
        # r"^\s*[\dA-Za-z.\-]+\s*" also swallowed the first WORD of unnumbered
        # titles — e.g. "Introduction" became "" and was discarded.)
        title = re.sub(r"^\s*(?:\d+(?:\.\d+)*|[A-Za-z])[.)\-]?\s+", "", raw_title)
        # Drop dot leaders and trailing page numbers ("..... 12").
        title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
        if 4 < len(title) < 120:
            titles.append(title)
    return titles


def _extract_questions(ai_response):
    """Parse model output into an ordered, case-insensitively deduplicated
    list of question strings (8–140 chars each)."""
    candidates = []
    for ln in ai_response.splitlines():
        # Remove bullet/ordinal prefixes like "1.", "-", "•".
        ln_clean = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln.strip()).strip()
        if not ln_clean:
            continue
        if ln_clean.endswith("?"):
            q = ln_clean
        elif (len(ln_clean.split()) < 18) and re.match(
            r"(?i)^(what|how|why|where|who|when|which)\b", ln_clean
        ):
            # Short English interrogative missing its "?" — add one.
            q = ln_clean + "?"
        elif re.match(r"^(क्या|क्यों|कैसे|कहाँ|कौन|किस|कब)\b", ln_clean):
            # Hindi interrogative — same treatment.
            q = ln_clean + "?"
        else:
            # Skip lines that don't look like questions.
            continue
        q = q.strip()
        if 8 <= len(q) <= 140:
            candidates.append(q)
    # Dedupe while preserving first-seen order.
    seen = set()
    final = []
    for q in candidates:
        key = q.lower()
        if key not in seen:
            seen.add(key)
            final.append(q)
    return final


def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", doc_lang="en"):
    """
    Generate 5–7 short, natural questions from the TOC plus a sample of chunks.

    Parameters
    ----------
    toc : list of (section, raw_title) pairs (may be empty/None).
    chunks : list[str] — the first three chunks seed the prompt context.
    doc_name : str — shown to the model for context.
    doc_lang : str — "hi"/"hi-*" yields Hindi questions, anything else English.

    Returns a list of at most 7 question strings.  Falls back to canned
    bilingual questions when there is no TOC/content or generation fails.
    """
    # Unified language check (the old early-fallback used `doc_lang != "hi"`
    # while the rest of the function used startswith, so "hi-IN" got an
    # English fallback but a Hindi prompt).
    is_hindi = str(doc_lang).startswith("hi")
    if not toc or not chunks:
        # Sensible bilingual fallback when there is nothing to work from.
        if is_hindi:
            return [
                "मैं इस गाइड का उपयोग कैसे शुरू करूँ?",
                "यह दस्तावेज़ क्या कवर करता है?"
            ]
        return ["How do I start using this guide?", "What does this document cover?"]
    titles = _toc_titles(toc)
    context_sample = " ".join(chunks[:3])[:4000]
    # Choose language-aware prompt.
    if is_hindi:
        prompt = f"""
आप एक सामग्री सहायक हैं। नीचे दिए गए तालिका-समाचार (Table of Contents) और दस्तावेज़ के नमूना पाठ के आधार पर 5 से 7 संक्षिप्त, साफ़ और मानवीय प्रश्न बनाइए।
प्रत्येक प्रश्न हिंदी में होना चाहिए, 18 शब्दों से कम, और प्रश्न चिह्न "?" के साथ समाप्त होना चाहिए। प्रश्न केवल दस्तावेज़ से प्रेरित हों — नई जानकारी इजाद न करें।
दस्तावेज़: "{doc_name}"
TABLE OF CONTENTS:
{chr(10).join(['- ' + t for t in titles[:8]])}
SAMPLE TEXT:
{context_sample}
आउटपुट: हर प्रश्न को नई लाइन पर लिखें, किसी भी क्रम चिन्ह के साथ (1., -, •) चलेगा। केवल प्रश्न लिखें।
"""
    else:
        prompt = f"""
You are a content assistant. Based on the Table of Contents and the sample document text below, generate 5–7 short, natural user-facing questions.
Each question should be in English, <18 words, and end with a question mark.
Document: "{doc_name}"
TABLE OF CONTENTS:
{chr(10).join(['- ' + t for t in titles[:8]])}
SAMPLE TEXT:
{context_sample}
Output: Put one question per line. Do not invent facts — base questions on the document.
"""
    try:
        ai_response = genai_generate(prompt)
        final = _extract_questions(ai_response)
        if not final:
            # Model returned nothing usable — build naive template questions
            # straight from the TOC titles instead.
            for t in titles[:7]:
                if is_hindi:
                    final.append(t.rstrip(".") + " के बारे में क्या जानना चाहिए?")
                else:
                    final.append("What should I know about " + t.rstrip(".") + "?")
        return final[:7]
    except Exception:
        # Generation failed entirely — graceful bilingual fallback.
        if is_hindi:
            return [
                "इस दस्तावेज़ को कैसे शुरू करूँ?",
                "इस दस्तावेज़ का मुख्य उद्देश्य क्या है?",
                "प्रमुख हिस्से कौन से हैं?"
            ]
        return ["How do I start using this guide?", "What does this document cover?"]
# ==========================================================
# 🎨 STYLING — MINIMAL ENTERPRISE DESIGN
# ==========================================================
# NOTE(review): the injected CSS block is currently empty — this markdown
# call emits only a blank string. Confirm whether custom styles were lost.
st.markdown("""
""", unsafe_allow_html=True)
# ==========================================================
# 🧭 SIDEBAR
# ==========================================================
with st.sidebar:
    st.markdown("### 🧭 Response Style")
    # Answer-grounding mode; read again in the response section below.
    mode = st.radio(
        "",
        ("Strict (Document-only)", "Extended (Document + general)"),
        index=0,
        help="Strict = answers only from the uploaded document. Extended = may include related general info.",
    )
    st.markdown("---")
    show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
    if show_dev:
        st.markdown("### ⚙️ Developer Options")
        chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
        overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
        top_k = st.slider("Top K Results", 1, 10, 7)
    else:
        # Defaults used when the developer panel is hidden.
        chunk_size, overlap, top_k = 1000, 120, 5
    st.markdown("---")
    st.caption("✨ Built by Shubham Sharma")
# ==========================================================
# 🧠 SESSION STATE
# ==========================================================
# Seed each key exactly once; existing values survive Streamlit reruns.
for key, val in {
    "user_query_input": "",           # contents of the question text box
    "show_more": False,               # suggestion list expanded?
    "selected_suggestion": None,      # index of the clicked suggestion button
    "query_suggestions_fixed": None,  # cached suggestions for the current doc
    "last_doc": None,                 # basename of the last processed document
    "doc_lang": "en",  # 🆕 store document language
}.items():
    if key not in st.session_state:
        st.session_state[key] = val
def set_user_query(q, idx):
    """Copy a clicked suggestion into the query box and rerun the app.

    Parameters
    ----------
    q : str — suggestion text to place into the input widget.
    idx : int — position of the clicked suggestion (stored for highlighting).
    """
    st.session_state["user_query_input"] = q
    st.session_state["selected_suggestion"] = idx
    # st.experimental_rerun() has been removed from Streamlit; use st.rerun()
    # for consistency with the other rerun calls in this script.
    st.rerun()
# ==========================================================
# 📄 MAIN SECTION
# ==========================================================
st.title("📄 Enterprise Knowledge Assistant")
st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
# ==========================================================
# 📂 DOCUMENT HANDLING
# ==========================================================
if doc_choice == "-- Select --":
    st.info("⬅️ Select or upload a document to begin.")
else:
    if doc_choice == "Sample PDF":
        # Bundled demo document shipped next to this script.
        temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
        st.success("📘 Sample document loaded successfully — you can start asking your questions below.")
    else:
        uploaded_file = st.file_uploader("", type="pdf", label_visibility="collapsed")
        if uploaded_file:
            # Persist the upload to /tmp so the path-based loaders can read it.
            temp_path = os.path.join("/tmp", uploaded_file.name)
            with open(temp_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            st.success("✅ Document processed successfully — you can start asking your questions below.")
        else:
            temp_path = None
    if temp_path:
        with st.spinner("🔍 Processing document..."):
            text, toc, toc_source = extract_text_from_pdf(temp_path)
            chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
        # 🌐 Detect document language (robust multilingual)
        doc_sample = " ".join(chunks[:3])[:3000]
        doc_lang = detect_language(doc_sample)
        st.session_state["doc_lang"] = doc_lang
        lang_label = "Hindi" if doc_lang.startswith("hi") else "English"
        st.caption(f"🈹 Detected document language: {lang_label}")
        with st.spinner("⚙️ Building search index..."):
            # Embeddings are cached per document basename to avoid re-embedding.
            embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
            index = build_faiss_index(embeddings)
        doc_name = os.path.basename(temp_path)
        if st.session_state["last_doc"] != doc_name:
            # New document: regenerate suggestions and reset the query state,
            # then rerun so the UI reflects the fresh state.
            query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name, doc_lang)
            st.session_state["query_suggestions_fixed"] = query_suggestions
            st.session_state["last_doc"] = doc_name
            st.session_state["user_query_input"] = ""
            st.session_state["selected_suggestion"] = None
            st.session_state["show_more"] = False
            st.rerun()
        else:
            # Same document as the previous run: reuse the cached suggestions.
            query_suggestions = st.session_state["query_suggestions_fixed"]
# ----------------------------------------------------------
# 💬 ASK SECTION
# ----------------------------------------------------------
st.markdown("### 💬 Ask the Assistant")
if query_suggestions:
visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
cols = st.columns(min(3, len(visible)))
for i, q in enumerate(visible):
if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
set_user_query(q, i)
toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
if st.button(toggle_text, help="Show or hide more suggestions"):
st.session_state["show_more"] = not st.session_state["show_more"]
st.rerun()
user_query = st.text_input("Type your question or click one above:", key="user_query_input")
# ----------------------------------------------------------
# 💡 RESPONSE SECTION
# ----------------------------------------------------------
if user_query.strip():
reasoning_mode = mode == "Extended (Document + general)"
with st.spinner("💭 Generating your answer..."):
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
doc_lang = st.session_state.get("doc_lang", "en")
print("🧠 Document language used for GPT prompt:", doc_lang)
answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode, doc_lang=doc_lang)
st.markdown("### 🤖 Assistant’s Answer")
if not reasoning_mode and not answer.startswith("⚠️"):
answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
answer = re.sub(r"(^|\n)-\s*", r"\1
• ", answer)
st.markdown(f"