Spaces:

aaporosh
/

SmartPDF_Q_A

Sleeping

App Files Files Community

aaporosh committed on Aug 20

Commit

56d0815

verified ·

1 Parent(s): 058a20c

Update app.py

Browse files

Files changed (1) hide show

app.py +413 -399

app.py CHANGED Viewed

@@ -2,410 +2,424 @@ import streamlit as st
 import logging
 import os
 from io import BytesIO
-import pdfplumber
-from PIL import Image
-import pytesseract
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-from sentence_transformers import SentenceTransformer
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
-from datasets import load_dataset
-from rank_bm25 import BM25Okapi
-from rouge_score import rouge_scorer
 import re
 import time
-# Setup logging for Spaces
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-# Lazy load models
-@st.cache_resource(ttl=1800)
-def load_embeddings_model():
-    logger.info("Loading embeddings model")
-    try:
-        return SentenceTransformer("all-MiniLM-L6-v2")
-    except Exception as e:
-        logger.error(f"Embeddings load error: {str(e)}")
-        st.error(f"Embedding model error: {str(e)}")
-        return None
-@st.cache_resource(ttl=1800)
-def load_qa_pipeline():
-    logger.info("Loading QA pipeline")
-    try:
-        dataset = load_and_prepare_dataset()
-        if dataset:
-            fine_tuned_pipeline = fine_tune_qa_model(dataset)
-            if fine_tuned_pipeline:
-                return fine_tuned_pipeline
-        return pipeline("text2text-generation", model="google/flan-t5-small", max_length=300)
-    except Exception as e:
-        logger.error(f"QA model load error: {str(e)}")
-        st.error(f"QA model error: {str(e)}")
-        return None
-@st.cache_resource(ttl=1800)
-def load_summary_pipeline():
-    logger.info("Loading summary pipeline")
-    try:
-        return pipeline("summarization", model="facebook/bart-large-cnn", max_length=250)
-    except Exception as e:
-        logger.error(f"Summary model load error: {str(e)}")
-        st.error(f"Summary model error: {str(e)}")
-        return None
-# Load and prepare dataset (e.g., SQuAD)
-@st.cache_data(ttl=3600)
-def load_and_prepare_dataset(dataset_name="squad", max_samples=1000):
-    logger.info(f"Loading dataset: {dataset_name}")
-    try:
-        dataset = load_dataset(dataset_name, split="train[:80%]")
-        dataset = dataset.shuffle(seed=42).select(range(min(max_samples, len(dataset))))
-        def preprocess(examples):
-            inputs = [f"question: {q} context: {c}" for q, c in zip(examples['question'], examples['context'])]
-            targets = examples['answers']['text']
-            return {'input_text': inputs, 'target_text': [t[0] if t else "" for t in targets]}
-        dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
-        return dataset
-    except Exception as e:
-        logger.error(f"Dataset load error: {str(e)}")
-        return None
-# Fine-tune QA model
-@st.cache_resource(ttl=3600)
-def fine_tune_qa_model(dataset):
-    logger.info("Starting fine-tuning")
-    try:
-        model_name = "google/flan-t5-small"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-        def tokenize_function(examples):
-            model_inputs = tokenizer(examples['input_text'], max_length=512, truncation=True, padding="max_length")
-            labels = tokenizer(examples['target_text'], max_length=128, truncation=True, padding="max_length")
-            model_inputs["labels"] = labels["input_ids"]
-            return model_inputs
-        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['input_text', 'target_text'])
-        training_args = TrainingArguments(
-            output_dir="./fine_tuned_model",
-            num_train_epochs = 2,
-            per_device_train_batch_size=4,
-            save_steps=500,
-            logging_steps=100,
-            evaluation_strategy="no",
-            learning_rate=3e-5,
-            fp16=False,
-        )
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-        )
-        trainer.train()
-        model.save_pretrained("./fine_tuned_model")
-        tokenizer.save_pretrained("./fine_tuned_model")
-        logger.info("Fine-tuning complete")
-        return pipeline("text2text-generation", model="./fine_tuned_model", tokenizer="./fine_tuned_model", max_length=300)
-    except Exception as e:
-        logger.error(f"Fine-tuning error: {str(e)}")
-        return None
-# Augment vector store with dataset
-def augment_vector_store(vector_store, dataset_name="squad", max_samples=300):
-    logger.info(f"Augmenting vector store with dataset: {dataset_name}")
-    try:
-        dataset = load_dataset(dataset_name, split="train").select(range(min(max_samples, len(dataset))))
-        chunks = [f"Context: {c}\nAnswer: {a['text'][0]}" for c, a in zip(dataset['context'], dataset['answers'])]
-        embeddings_model = load_embeddings_model()
-        if embeddings_model and vector_store:
-            embeddings = embeddings_model.encode(chunks, batch_size=128, show_progress_bar=False)
-            vector_store.add_embeddings(zip(chunks, embeddings))
-        return vector_store
-    except Exception as e:
-        logger.error(f"Vector store augmentation error: {str(e)}")
-        return vector_store
-# Process PDF with enhanced extraction and OCR fallback
-def process_pdf(uploaded_file):
-    logger.info("Processing PDF with enhanced extraction")
-    try:
-        text = ""
-        code_blocks = []
-        with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
-            for page in pdf.pages[:8]:
-                extracted = page.extract_text(layout=False)
-                if not extracted:
-                    try:
-                        img = page.to_image(resolution=150).original
-                        extracted = pytesseract.image_to_string(img, config='--psm 6')
-                    except Exception as ocr_e:
-                        logger.warning(f"OCR failed: {str(ocr_e)}")
-                if extracted:
-                    # Clean text: remove headers/footers (simple heuristic)
-                    lines = extracted.split("\n")
-                    cleaned_lines = [line for line in lines if not re.match(r'^\s*(Page \d+|.*\d{4}-\d{4}|Copyright.*)\s*$', line, re.I)]
-                    text += "\n".join(cleaned_lines) + "\n"
-                for char in page.chars:
-                    if 'fontname' in char and 'mono' in char['fontname'].lower():
-                        code_blocks.append(char['text'])
-                code_text = page.extract_text()
-                code_matches = re.finditer(r'(^\s{2,}.*?(?:\n\s{2,}.*?)*)', code_text, re.MULTILINE)
-                for match in code_matches:
-                    code_blocks.append(match.group().strip())
-                tables = page.extract_tables()
-                if tables:
-                    for table in tables:
-                        text += "\n".join([" | ".join(map(str, row)) for row in table if row]) + "\n"
-                for obj in page.extract_words():
-                    if obj.get('size', 0) > 12:
-                        text += f"\n{obj['text']}\n"
-        code_text = "\n".join(code_blocks).strip()
-        if not text:
-            raise ValueError("No text extracted from PDF")
-        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=250, chunk_overlap=40, keep_separator=True)
-        text_chunks = text_splitter.split_text(text)[:25]
-        code_chunks = text_splitter.split_text(code_text)[:10] if code_text else []
-        embeddings_model = load_embeddings_model()
-        if not embeddings_model:
-            return None, None, text, code_text
-        text_vector_store = FAISS.from_embeddings(
-            zip(text_chunks, [embeddings_model.encode(chunk, show_progress_bar=False, batch_size=128) for chunk in text_chunks]),
-            embeddings_model.encode
-        ) if text_chunks else None
-        code_vector_store = FAISS.from_embeddings(
-            zip(code_chunks, [embeddings_model.encode(chunk, show_progress_bar=False, batch_size=128) for chunk in code_chunks]),
-            embeddings_model.encode
-        ) if code_chunks else None
-        if text_vector_store:
-            text_vector_store = augment_vector_store(text_vector_store)
-        logger.info("PDF processed successfully")
-        return text_vector_store, code_vector_store, text, code_text
-    except Exception as e:
-        logger.error(f"PDF processing error: {str(e)}")
-        st.error(f"PDF error: {str(e)}")
-        return None, None, "", ""
-# Summarize PDF with ROUGE metrics and improved topic focus
-def summarize_pdf(text):
-    logger.info("Generating summary")
-    try:
-        summary_pipeline = load_summary_pipeline()
-        if not summary_pipeline:
-            return "Summary model unavailable."
-        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=250, chunk_overlap=40)
-        chunks = text_splitter.split_text(text)
-        # Hybrid search for relevant chunks
-        embeddings_model = load_embeddings_model()
-        if embeddings_model and chunks:
-            temp_vector_store = FAISS.from_embeddings(
-                zip(chunks, [embeddings_model.encode(chunk, show_progress_bar=False) for chunk in chunks]),
-                embeddings_model.encode
-            )
-            bm25 = BM25Okapi([chunk.split() for chunk in chunks])
-            query = "main topic and key points"
-            bm25_docs = bm25.get_top_n(query.split(), chunks, n=4)
-            faiss_docs = temp_vector_store.similarity_search(query, k=4)
-            selected_chunks = list(set(bm25_docs + [doc.page_content for doc in faiss_docs]))[:4]
         else:
-            selected_chunks = chunks[:4]
-        summaries = []
-        for chunk in selected_chunks:
-            summary = summary_pipeline(f"Summarize the main topic and key points in detail: {chunk[:250]}", max_length=100, min_length=50, do_sample=False)[0]['summary_text']
-            summaries.append(summary.strip())
-        combined_summary = " ".join(summaries)
-        if len(combined_summary.split()) > 250:
-            combined_summary = " ".join(combined_summary.split()[:250])
-        word_count = len(combined_summary.split())
-        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
-        scores = scorer.score(text[:500], combined_summary)
-        logger.info(f"ROUGE scores: {scores}")
-        return f"**Main Topic Summary** ({word_count} words):\n{combined_summary}\n\n**ROUGE-1**: {scores['rouge1'].fmeasure:.2f}"
-    except Exception as e:
-        logger.error(f"Summary error: {str(e)}")
-        return f"Oops, something went wrong summarizing: {str(e)}"
-# Answer question with hybrid search
-def answer_question(text_vector_store, code_vector_store, query):
-    logger.info(f"Processing query: {query}")
-    try:
-        if not text_vector_store and not code_vector_store:
-            return "Please upload a PDF first!"
-        qa_pipeline = load_qa_pipeline()
-        if not qa_pipeline:
-            return "Sorry, the QA model is unavailable right now."
-        is_code_query = any(keyword in query.lower() for keyword in ["code", "script", "function", "programming", "give me code", "show code"])
-        if is_code_query and code_vector_store:
-            docs = code_vector_store.similarity_search(query, k=3)
-            code = "\n".join(doc.page_content for doc in docs)
-            explanation = qa_pipeline(f"Explain this code: {code[:500]}")[0]['generated_text']
-            return f"**Code**:\n```python\n{code}\n```\n**Explanation**:\n{explanation}"
-        vector_store = text_vector_store
-        if not vector_store:
-            return "No relevant content found for your query."
-        # Hybrid search: FAISS + BM25
-        text_chunks = [doc.page_content for doc in vector_store.similarity_search(query, k=10)]
-        bm25 = BM25Okapi([chunk.split() for chunk in text_chunks])
-        bm25_docs = bm25.get_top_n(query.split(), text_chunks, n=5)
-        faiss_docs = vector_store.similarity_search(query, k=5)
-        combined_docs = list(set(bm25_docs + [doc.page_content for doc in faiss_docs]))[:5]
-        context = "\n".join(combined_docs)
-        prompt = f"Use the following PDF content to answer the question accurately and concisely. Avoid speculation and focus on the provided context:\n\n{context}\n\nQuestion: {query}\nAnswer:"
-        response = qa_pipeline(prompt)[0]['generated_text']
-        logger.info("Answer generated")
-        return f"**Answer**:\n{response.strip()}\n\n**Source Context**:\n{context[:500]}..."
-    except Exception as e:
-        logger.error(f"Query error: {str(e)}")
-        return f"Sorry, something went wrong: {str(e)}"
-# Streamlit UI
-try:
-    st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄", layout="wide")
-    st.markdown("""
-        <style>
-        .main { max-width: 900px; margin: 0 auto; padding: 20px; }
-        .sidebar { background-color: #f8f9fa; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
-        .chat-container { border: 1px solid #ddd; border-radius: 12px; padding: 15px; height: 60vh; overflow-y: auto; margin-top: 20px; background-color: #fafafa; }
-        .stChatMessage { border-radius: 12px; padding: 12px; margin: 8px; max-width: 75%; transition: all 0.3s ease; }
-        .user { background-color: #e6f3ff; align-self: flex-end; border: 1px solid #b3d4fc; }
-        .assistant { background-color: #f0f0f0; border: 1px solid #ccc; }
-        .dark .user { background-color: #2a2a72; color: #fff; border: 1px solid #4a4ab2; }
-        .dark .assistant { background-color: #2e2e2e; color: #fff; border: 1px solid #4a4a4a; }
-        .stButton>button { background-color: #4CAF50; color: white; border: none; padding: 10px 20px; border-radius: 8px; font-weight: bold; }
-        .stButton>button:hover { background-color: #45a049; transform: scale(1.05); }
-        pre { background-color: #f8f8f8; padding: 12px; border-radius: 8px; overflow-x: auto; }
-        .header { background: linear-gradient(90deg, #4CAF50, #81C784); color: white; padding: 15px; border-radius: 8px; text-align: center; box-shadow: 0 2px 4px rgba(0,0,0,0.2); }
-        .progress-bar { background-color: #e0e0e0; border-radius: 5px; height: 10px; }
-        .progress-fill { background-color: #4CAF50; height: 100%; border-radius: 5px; transition: width 0.5s ease; }
-        </style>
-    """, unsafe_allow_html=True)
-    st.markdown('<div class="header"><h1>Smart PDF Q&A</h1></div>', unsafe_allow_html=True)
-    st.markdown("Upload a PDF to ask questions, get a ~200-word summary, or extract code with 'give me code'. Optimized for speed and accuracy!")
-    # Initialize session state
-    if "messages" not in st.session_state:
-        st.session_state.messages = []
-    if "text_vector_store" not in st.session_state:
-        st.session_state.text_vector_store = None
-    if "code_vector_store" not in st.session_state:
-        st.session_state.code_vector_store = None
-    if "pdf_text" not in st.session_state:
-        st.session_state.pdf_text = ""
-    if "code_text" not in st.session_state:
-        st.session_state.code_text = ""
-    # Sidebar with controls
-    with st.sidebar:
-        st.markdown('<div class="sidebar">', unsafe_allow_html=True)
-        theme = st.radio("Theme", ["Light", "Dark"], index=0)
-        dataset_name = st.selectbox("Select Dataset for Fine-Tuning", ["squad", "cnn_dailymail", "bigcode/the-stack"], index=0)
-        if st.button("Fine-Tune Model"):
-            progress_bar = st.progress(0)
-            for i in range(100):
-                time.sleep(0.008)
-                progress_bar.progress(i + 1)
-            dataset = load_and_prepare_dataset(dataset_name=dataset_name)
-            if dataset:
-                fine_tuned_pipeline = fine_tune_qa_model(dataset)
-                if fine_tuned_pipeline:
-                    st.success("Model fine-tuned successfully!")
-                else:
-                    st.error("Fine-tuning failed.")
-        if st.button("Clear Chat"):
-            st.session_state.messages = []
             st.experimental_rerun()
-        if st.button("Retry Summarization") and st.session_state.pdf_text:
-            progress_bar = st.progress(0)
-            with st.spinner("Retrying summarization..."):
-                for i in range(100):
-                    time.sleep(0.008)
-                    progress_bar.progress(i + 1)
-                summary = summarize_pdf(st.session_state.pdf_text)
-                st.session_state.messages.append({"role": "assistant", "content": summary})
-                st.markdown(summary, unsafe_allow_html=True)
-        st.markdown('</div>', unsafe_allow_html=True)
-    # PDF upload and processing
-    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
-    col1, col2 = st.columns([1, 1])
-    with col1:
-        if st.button("Process PDF"):
-            progress_bar = st.progress(0)
-            with st.spinner("Processing PDF..."):
-                for i in range(100):
-                    time.sleep(0.02)
-                    progress_bar.progress(i + 1)
-                st.session_state.text_vector_store, st.session_state.code_vector_store, st.session_state.pdf_text, st.session_state.code_text = process_pdf(uploaded_file)
-                if st.session_state.text_vector_store or st.session_state.code_vector_store:
-                    st.success("PDF processed! Ask away or summarize.")
-                    st.session_state.messages = []
-                else:
-                    st.error("Failed to process PDF.")
-    with col2:
-        if st.button("Summarize PDF") and st.session_state.pdf_text:
-            progress_bar = st.progress(0)
-            with st.spinner("Summarizing..."):
-                for i in range(100):
-                    time.sleep(0.008)
-                    progress_bar.progress(i + 1)
-                summary = summarize_pdf(st.session_state.pdf_text)
-                st.session_state.messages.append({"role": "assistant", "content": summary})
-                st.markdown(summary, unsafe_allow_html=True)
-    # Chat interface
-    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
-    if st.session_state.text_vector_store or st.session_state.code_vector_store:
-        prompt = st.chat_input("Ask a question (e.g., 'Give me code' or 'What’s the main idea?'):")
-        if prompt:
-            st.session_state.messages.append({"role": "user", "content": prompt})
             with st.chat_message("user"):
-                st.markdown(prompt)
             with st.chat_message("assistant"):
-                progress_bar = st.progress(0)
-                with st.spinner('<div class="spinner">⏳ Processing...</div>'):
-                    for i in range(100):
-                        time.sleep(0.004)
-                        progress_bar.progress(i + 1)
-                    answer = answer_question(st.session_state.text_vector_store, st.session_state.code_vector_store, prompt)
-                st.markdown(answer, unsafe_allow_html=True)
-            st.session_state.messages.append({"role": "assistant", "content": answer})
-    # Display chat history
-    for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"], unsafe_allow_html=True)
-    st.markdown('</div>', unsafe_allow_html=True)
-    # Download chat history
-    if st.session_state.messages:
-        chat_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in st.session_state.messages)
-        st.download_button("Download Chat History", chat_text, "chat_history.txt")
-except Exception as e:
-    logger.error(f"App initialization failed: {str(e)}")
-    st.error(f"App failed to start: {str(e)}. Check Spaces logs or contact support.")

 import logging
 import os
 from io import BytesIO
 import re
 import time
+from typing import List, Tuple, Optional
+import pdfplumber
+# Optional OCR (guarded)
+try:
+    import pytesseract
+    OCR_AVAILABLE = True
+except Exception:
+    OCR_AVAILABLE = False
+from rank_bm25 import BM25Okapi
+# Embeddings + Vector store
+from sentence_transformers import SentenceTransformer
+import numpy as np
+try:
+    import faiss  # direct FAISS for speed and control
+    FAISS_OK = True
+except Exception:
+    FAISS_OK = False
+# Lightweight HF pipelines
+from transformers import pipeline
+# ----------------------------
+# App & Logging Setup
+# ----------------------------
+# Page configuration is issued ahead of every other st.* call visible in this file;
+# logging then gets a timestamped INFO-level format and an app-specific logger name.
+st.set_page_config(page_title="Smart PDF Chat & Summarizer", page_icon="📄", layout="wide")
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("smart_pdf")
+# ----------------------------
+# Caching: models & utilities
+# ----------------------------
+@st.cache_resource(show_spinner=False)
+def get_embedder(name: str = "sentence-transformers/all-MiniLM-L6-v2"):
+    """Load the SentenceTransformer model called `name`.
+
+    Wrapped in st.cache_resource so repeated calls can reuse the loaded model.
+    """
+    model = SentenceTransformer(name)
+    return model
+@st.cache_resource(show_spinner=False)
+def get_qa_pipeline():
+    """Create the question-answering generator: flan-t5-small as a text2text pipeline."""
+    # Chosen because it is a small, fast instruction model.
+    qa_options = dict(model="google/flan-t5-small", device=-1, max_length=220)
+    generator = pipeline("text2text-generation", **qa_options)
+    return generator
+@st.cache_resource(show_spinner=False)
+def get_summarizer():
+    """Create the summarization pipeline backed by sshleifer/distilbart-cnn-12-6."""
+    # DistilBART was picked because it is much faster than bart-large-cnn.
+    summarizer_options = dict(
+        model="sshleifer/distilbart-cnn-12-6",
+        device=-1,
+        max_length=220,
+        min_length=80,
+        do_sample=False,
+    )
+    return pipeline("summarization", **summarizer_options)
+# ----------------------------
+# PDF processing
+# ----------------------------
+def _looks_like_code(line: str) -> bool:
+    """Heuristically decide whether one line of extracted text resembles source code.
+
+    Blank lines never count. Any other line counts when it starts with at
+    least four whitespace characters or matches any of the patterns below.
+    NOTE: the patterns are loose - ordinary words such as "for", "if" or
+    "with" and any bracket character also match.
+    """
+    if not line.strip():
+        return False
+    leading_ws = len(line) - len(line.lstrip())
+    if leading_ws >= 4:
+        return True
+    signal_patterns = (
+        r"\b(def|class|import|from|return|if|elif|else|for|while|try|except|finally|with)\b",
+        r"[{}`;<>]|::|=>|#|//|/\*|\*/",
+        r"\(|\)|\[|\]|\{|\}",
+    )
+    return any(re.search(pattern, line) for pattern in signal_patterns)
+def extract_text_and_code_from_pdf(file_bytes: bytes, ocr_fallback: bool = True, max_pages: int = 50) -> Tuple[str, List[str]]:
+    """Extract plain text and code-like blocks from a PDF.
+
+    Args:
+        file_bytes: Raw bytes of the PDF document.
+        ocr_fallback: When True (and pytesseract imported successfully), a page
+            that yields no text is rendered to an image and OCR'd instead.
+        max_pages: Only the first `max_pages` pages are read.
+
+    Returns:
+        (full_text, code_blocks): page texts and table rows joined by blank
+        lines, plus a de-duplicated list of code-like blocks. Each block is
+        capped at 8000 characters; the cap applies to every consumer of the
+        returned list.
+
+    Errors from opening the PDF or extracting page text propagate to the
+    caller; OCR and table-extraction failures are logged and skipped.
+    """
+    header_footer = re.compile(r"^(Page\s*\d+|Copyright.*)$", flags=re.I)
+    fence = re.compile(r"```[\w-]*\n([\s\S]*?)```")
+    text_parts: List[str] = []
+    code_blocks: List[str] = []
+    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
+        for page in pdf.pages[:max_pages]:
+            # 1) Try text extraction
+            extracted = page.extract_text(x_tolerance=1.5, y_tolerance=1.0) or ""
+            # 2) OCR fallback if page empty and OCR available
+            if not extracted.strip() and ocr_fallback and OCR_AVAILABLE:
+                try:
+                    img = page.to_image(resolution=180).original
+                    extracted = pytesseract.image_to_string(img, config='--psm 6') or ""
+                except Exception as e:
+                    logger.warning(f"OCR failed on a page: {e}")
+            # 3) Clean and collect
+            if extracted:
+                # Remove common headers/footers by simple rules
+                lines = [ln for ln in extracted.splitlines() if not header_footer.match(ln)]
+                text_parts.append("\n".join(lines))
+                # Fenced blocks are taken verbatim.
+                for blk in fence.findall(extracted):
+                    blk = blk.strip()
+                    if blk:
+                        code_blocks.append(blk)
+                # Line-wise heuristic: consecutive code-like lines form one block and
+                # any other line (or the end of the page) closes it. Grouping has to
+                # happen here, while the neighbouring non-code lines are still known.
+                run: List[str] = []
+                for ln in lines:
+                    if _looks_like_code(ln):
+                        run.append(ln)
+                    elif run:
+                        code_blocks.append("\n".join(run))
+                        run = []
+                if run:
+                    code_blocks.append("\n".join(run))
+            # 4) Tables -> pipe-separated rows
+            try:
+                tables = page.extract_tables() or []
+            except Exception as e:
+                logger.warning(f"Table extraction failed on a page: {e}")
+                tables = []
+            for tb in tables:
+                for row in tb:
+                    # A None cell becomes "" so it is not rendered as the text "None".
+                    cells = ["" if c is None else str(c).strip() for c in (row or [])]
+                    if any(cells):
+                        text_parts.append(" | ".join(cells))
+    full_text = "\n\n".join(tp for tp in text_parts if tp.strip())
+    # De-duplicate (first occurrence wins) and cap extremely long blocks.
+    seen = set()
+    unique_blocks: List[str] = []
+    for blk in code_blocks:
+        key = blk.strip()
+        if key and key not in seen:
+            seen.add(key)
+            unique_blocks.append(blk[:8000])
+    return full_text, unique_blocks
+# ----------------------------
+# Chunking & Indexing
+# ----------------------------
+def _split_oversized(piece: str, limit: int) -> List[str]:
+    """Break `piece` into parts of at most `limit` characters, cutting at whitespace when possible."""
+    parts: List[str] = []
+    rest = piece.strip()
+    while len(rest) > limit:
+        # Prefer the last newline/space that keeps the part within the limit.
+        cut = max(rest.rfind("\n", 0, limit + 1), rest.rfind(" ", 0, limit + 1))
+        if cut <= 0:
+            cut = limit  # no usable whitespace: hard cut
+        parts.append(rest[:cut].rstrip())
+        rest = rest[cut:].lstrip()
+    if rest:
+        parts.append(rest)
+    return parts
+def chunk_text(text: str, chunk_size: int = 700, chunk_overlap: int = 120) -> List[str]:
+    """Split `text` into overlapping chunks built from blank-line separated paragraphs.
+
+    Paragraphs are packed into a chunk until adding the next one would exceed
+    `chunk_size`; the following chunk then starts with the last
+    `chunk_overlap` characters of the previous one. A paragraph longer than
+    `chunk_size` is first broken into smaller pieces, so (for a positive
+    `chunk_size`) no chunk is longer than chunk_size + chunk_overlap + 1.
+
+    Args:
+        text: Text to split; runs of three or more newlines are collapsed.
+        chunk_size: Target maximum characters per chunk before overlap. A
+            value <= 0 disables the splitting of long paragraphs.
+        chunk_overlap: Characters carried over from the previous chunk.
+
+    Returns:
+        The chunks in document order; an empty list for blank input.
+    """
+    text = re.sub(r"\n{3,}", "\n\n", text).strip()
+    paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
+    if chunk_size > 0:
+        pieces = [part for para in paras for part in _split_oversized(para, chunk_size)]
+    else:
+        pieces = paras
+    chunks: List[str] = []
+    buf: str = ""
+    for piece in pieces:
+        if not buf:
+            buf = piece
+        elif len(buf) + len(piece) + 1 <= chunk_size:
+            buf += "\n" + piece
         else:
+            chunks.append(buf)
+            # overlap
+            overlap = buf[-chunk_overlap:] if chunk_overlap > 0 else ""
+            buf = (overlap + "\n" + piece).strip()
+    if buf:
+        chunks.append(buf)
+    return chunks
+@st.cache_resource(show_spinner=False)
+def build_indexes(chunks: List[str]):
+    """Embed `chunks` and build the lexical and vector indexes used for retrieval.
+
+    Args:
+        chunks: Non-empty list of text chunks.
+
+    Returns:
+        A dict with keys "chunks" (the input list), "embeddings" (float32
+        matrix of normalized embeddings, one row per chunk), "faiss" (an
+        inner-product index, or None when faiss could not be imported) and
+        "bm25" (BM25Okapi over whitespace-split chunks).
+
+    Raises:
+        ValueError: if `chunks` is empty.
+    """
+    if not chunks:
+        # Fail with a clear message instead of an error from inside the libraries.
+        raise ValueError("build_indexes() needs at least one chunk")
+    embedder = get_embedder()
+    matrix = embedder.encode(chunks, show_progress_bar=False, batch_size=64, normalize_embeddings=True)
+    matrix = np.asarray(matrix).astype('float32')
+    bm25 = BM25Okapi([c.split() for c in chunks])
+    index = None
+    if FAISS_OK:
+        # Embeddings are normalized, so inner product ranks like cosine similarity.
+        index = faiss.IndexFlatIP(matrix.shape[1])
+        index.add(matrix)
+    # With index=None the caller falls back to a numpy dot product on "embeddings".
+    return {
+        "chunks": chunks,
+        "embeddings": matrix,
+        "faiss": index,
+        "bm25": bm25,
+    }
+# ----------------------------
+# Retrieval + QA
+# ----------------------------
+def retrieve(topk: int, query: str, idx):
+    """Return up to `topk` chunks for `query`, mixing BM25 and embedding search.
+
+    Args:
+        topk: Maximum number of chunks to return.
+        query: The user question.
+        idx: Index dict with keys "chunks", "embeddings", "bm25" and "faiss".
+
+    Returns:
+        Unique chunks, alternating between the BM25 ranking and the dense
+        ranking (BM25 first at each rank) so that both retrievers contribute.
+        Taking all BM25 hits first would fill the `topk` budget before any
+        dense hit was considered.
+    """
+    chunks = idx["chunks"]
+    k = min(topk, len(chunks))
+    if k <= 0:
+        return []
+    # BM25
+    bm25_docs = idx["bm25"].get_top_n(query.split(), chunks, n=k)
+    # FAISS / cosine
+    qv = get_embedder().encode([query], normalize_embeddings=True)[0].astype('float32')
+    if idx["faiss"] is not None:
+        _scores, hits = idx["faiss"].search(np.array([qv]), k)
+        # FAISS pads missing results with -1; never index chunks with those.
+        dense_docs = [chunks[i] for i in hits[0] if i >= 0]
+    else:
+        # cosine with numpy
+        sims = idx["embeddings"] @ qv
+        dense_docs = [chunks[i] for i in np.argsort(-sims)[:k]]
+    # Interleave the two rankings, then append whatever the longer list has left.
+    ranked = [doc for pair in zip(bm25_docs, dense_docs) for doc in pair]
+    ranked += bm25_docs[len(dense_docs):] + dense_docs[len(bm25_docs):]
+    merged: List[str] = []
+    seen = set()
+    for doc in ranked:
+        if doc in seen:
+            continue
+        seen.add(doc)
+        merged.append(doc)
+        if len(merged) >= topk:
+            break
+    return merged
+def rag_answer(query: str, idx, max_ctx_chars: int = 3000) -> str:
+    """Answer `query` from the indexed PDF using retrieved context.
+
+    Args:
+        query: The user question.
+        idx: Index dict as produced by build_indexes().
+        max_ctx_chars: Character budget for the concatenated context.
+
+    Returns:
+        The generated answer, or the "couldn't find" message when nothing
+        was retrieved.
+    """
+    ctx_chunks = retrieve(6, query, idx)
+    if not ctx_chunks:
+        return "I couldn't find that in the PDF."
+    # Concatenate up to a char budget
+    ctx = "\n\n".join(ctx_chunks)[:max_ctx_chars]
+    qa = get_qa_pipeline()
+    # The question goes before the context: with truncation enabled, an
+    # over-long prompt loses the tail of the context rather than the question.
+    prompt = (
+        "Answer the question using ONLY the provided context. "
+        "If the answer is not in the context, say 'I couldn't find that in the PDF.'\n\n"
+        f"Question: {query}\n\nContext:\n{ctx}\n\nAnswer:"
+    )
+    # truncation=True keeps the tokenized prompt within the model's input limit.
+    out = qa(prompt, truncation=True)[0]["generated_text"].strip()
+    return out
+def summarize_text(full_text: str) -> str:
+    """Summarize a document by summarizing its first chunks and stitching the results.
+
+    The text is chunked (1200 chars, 150 overlap); at most the first eight
+    chunks are summarized individually and the partial summaries are joined.
+    If the joined text is longer than 2000 characters, its first 3000
+    characters are summarized once more.
+
+    Args:
+        full_text: The extracted document text.
+
+    Returns:
+        The summary, or "" when the text produces no chunks.
+    """
+    summarizer = get_summarizer()
+    # Summarize in parts for long docs
+    chunks = chunk_text(full_text, chunk_size=1200, chunk_overlap=150)
+    # truncation=True: a chunk can be longer than requested (chunk_text keeps a
+    # long paragraph whole), so let the tokenizer cut inputs the model cannot take.
+    partials = [
+        summarizer(ch, truncation=True)[0]["summary_text"].strip()
+        for ch in chunks[:8]  # cap to keep it snappy on CPU
+    ]
+    # Final stitch summary
+    stitched = " ".join(partials)
+    if len(stitched) > 2000:
+        stitched = summarizer(stitched[:3000], truncation=True)[0]["summary_text"].strip()
+    return stitched
+# ----------------------------
+# UI
+# ----------------------------
+# Inject the CSS classes (.app-header, .card, .muted, .kbd) and render the title banner;
+# both calls pass unsafe_allow_html=True because they emit raw HTML.
+st.markdown(
+    """
+    <style>
+      .app-header {background: linear-gradient(90deg,#10b981,#22c55e); color: white; padding: 16px; border-radius: 14px; text-align:center; box-shadow: 0 6px 20px rgba(16,185,129,.25)}
+      .card {border:1px solid #e5e7eb; border-radius: 14px; padding: 16px; background: #fff}
+      .muted {color:#6b7280}
+      .kbd {background:#f3f4f6; border:1px solid #e5e7eb; border-radius:6px; padding:2px 6px; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco}
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+st.markdown('<div class="app-header"><h1>📄 Smart PDF Chat & Summarizer</h1><p class="muted">Fast answers, focused summaries, and automatic code extraction</p></div>', unsafe_allow_html=True)
+# Session state: seed defaults for keys that are not set yet; existing values are kept.
+for _key, _default in (("idx", None), ("pdf_text", ""), ("code_blocks", [])):
+    if _key not in st.session_state:
+        st.session_state[_key] = _default
+# Sidebar: upload widget, parsing/chunking options, and the Build Index / Clear actions.
+with st.sidebar:
+    st.subheader("Upload & Options")
+    file = st.file_uploader("Upload a PDF", type=["pdf"], help="Max ~50 pages for speed. Uses OCR fallback if needed.")
+    max_pages = st.slider("Max pages to parse", 5, 100, 50, help="Lower = faster")
+    do_ocr = st.toggle("Enable OCR fallback (slower)", value=False)
+    chunk_size = st.slider("Chunk size", 300, 1400, 700, step=50)
+    overlap = st.slider("Chunk overlap", 0, 300, 120, step=10)
+    colA, colB = st.columns(2)
+    with colA:
+        if st.button("⚙️ Build Index", use_container_width=True, type="primary"):
+            if not file:
+                st.warning("Please upload a PDF first.")
+            else:
+                with st.spinner("Reading & indexing PDF…"):
+                    # getvalue() returns the whole upload; read() depends on the buffer
+                    # position, which may already be at the end after an earlier run.
+                    data = file.getvalue()
+                    # A (re)build invalidates everything derived from the previous index.
+                    st.session_state.idx = None
+                    st.session_state.pop("chat", None)
+                    st.session_state.pop("summary", None)
+                    text, code_blocks, read_error = "", [], None
+                    try:
+                        text, code_blocks = extract_text_and_code_from_pdf(data, ocr_fallback=do_ocr, max_pages=max_pages)
+                    except Exception as e:
+                        # Report unreadable PDFs in the UI instead of crashing the script run.
+                        logger.exception("PDF extraction failed")
+                        read_error = e
+                    st.session_state.pdf_text = text
+                    st.session_state.code_blocks = code_blocks
+                    if read_error is not None:
+                        st.error(f"Couldn't read the PDF: {read_error}")
+                    elif not text.strip():
+                        st.error("Couldn't extract any text from the PDF.")
+                    else:
+                        chunks = chunk_text(text, chunk_size=chunk_size, chunk_overlap=overlap)
+                        st.session_state.idx = build_indexes(chunks)
+                        st.success(f"Indexed {len(chunks)} chunks. Ready!")
+    with colB:
+        if st.button("🧹 Clear", use_container_width=True):
+            st.session_state.idx = None
+            st.session_state.pdf_text = ""
+            st.session_state.code_blocks = []
+            # Also drop the chat history and summary kept by the two tabs.
+            st.session_state.pop("chat", None)
+            st.session_state.pop("summary", None)
+            # st.rerun is the current name; fall back to the experimental one on old Streamlit.
+            rerun = getattr(st, "rerun", None) or st.experimental_rerun
+            rerun()
+    if st.session_state.code_blocks:
+        st.caption("Detected code blocks. You can copy or download from the Summary tab.")
+# Main area — two sections exactly: Chat & Summary
+chat_tab, summary_tab = st.tabs(["💬 Chat", "📝 Summary (with Code)"])
+# Chat tab: shows a hint until an index exists, then a chat input plus the stored history.
+with chat_tab:
+    st.markdown("<div class='card'>Ask questions about your PDF. Retrieval-augmented answers use only the document context.</div>", unsafe_allow_html=True)
+    if st.session_state.idx is None:
+        st.info("Upload a PDF and click **Build Index** in the sidebar.")
+    else:
+        user_q = st.chat_input("Ask anything about the PDF…")
+        # History lives in st.session_state.chat as (role, content) tuples.
+        if "chat" not in st.session_state:
+            st.session_state.chat = []
+        # Render history
+        for role, content in st.session_state.get("chat", []):
+            with st.chat_message(role):
+                st.markdown(content)
+        # A new question is stored and echoed, then answered and stored as well.
+        if user_q:
+            st.session_state.chat.append(("user", user_q))
             with st.chat_message("user"):
+                st.markdown(user_q)
             with st.chat_message("assistant"):
+                with st.spinner("Thinking…"):
+                    # Any failure in rag_answer is shown as the assistant's reply instead of raising.
+                    try:
+                        ans = rag_answer(user_q, st.session_state.idx)
+                    except Exception as e:
+                        ans = f"Sorry, I hit an error while answering: {e}"
+                st.markdown(ans)
+            st.session_state.chat.append(("assistant", ans))
+# Summary tab: on-demand summary of the parsed text, a download of the raw text,
+# and the list of extracted code blocks.
+with summary_tab:
+    st.markdown("<div class='card'>One-click concise summary of the entire document, plus extracted programming code if detected.</div>", unsafe_allow_html=True)
+    col1, col2 = st.columns([1,1])
+    with col1:
+        if st.button("🔎 Summarize PDF", type="primary", use_container_width=True):
+            if not st.session_state.pdf_text.strip():
+                st.warning("No parsed text yet. Upload & Build Index first.")
+            else:
+                with st.spinner("Summarizing…"):
+                    # The result is kept in session state so it is still shown on later reruns.
+                    try:
+                        sm = summarize_text(st.session_state.pdf_text)
+                        st.session_state.summary = sm
+                        st.success("Summary generated.")
+                    except Exception as e:
+                        st.error(f"Summarization failed: {e}")
+    with col2:
+        if st.session_state.pdf_text:
+            st.download_button(
+                "⬇️ Download raw extracted text",
+                st.session_state.pdf_text,
+                file_name="extracted_text.txt",
+                use_container_width=True,
+            )
+    if st.session_state.get("summary"):
+        st.subheader("Summary")
+        st.write(st.session_state.summary)
+    st.divider()
+    st.subheader("Extracted Code")
+    if st.session_state.code_blocks:
+        # One expander per block; each download button gets its own key (dl_<n>).
+        for i, blk in enumerate(st.session_state.code_blocks, start=1):
+            with st.expander(f"Code block #{i}"):
+                st.code(blk, language=None)
+                st.download_button(
+                    f"Download code #{i}",
+                    blk,
+                    file_name=f"code_block_{i}.txt",
+                    key=f"dl_{i}",
+                )
+        # Combined download: all blocks separated by two blank lines.
+        all_code = "\n\n\n".join(st.session_state.code_blocks)
+        st.download_button("⬇️ Download all code", all_code, file_name="all_code.txt")
+    else:
+        st.caption("No code-like content detected yet.")
+# Footer tips
+st.markdown(
+    """
+    <div class="muted" style="margin-top:24px">⚡ Tips for faster responses: use smaller PDFs, lower the "Max pages" and "Chunk size" in the sidebar, and keep OCR off unless needed.</div>
+    """,
+    unsafe_allow_html=True,
+)