Spaces:

aaporosh
/

SmartPDF_Q_A

Sleeping

App Files Files Community

aaporosh committed on Aug 20, 2025

Commit

b561129

verified ·

1 Parent(s): 11694c7

Update app.py

Browse files

Files changed (1) hide show

app.py +406 -146

app.py CHANGED Viewed

@@ -1,157 +1,417 @@
-# ------------- app.py -------------
 import streamlit as st
-from pathlib import Path
 from io import BytesIO
-import pdfplumber, pytesseract, time, re, logging, os
 from PIL import Image
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from sentence_transformers import SentenceTransformer
-from transformers import pipeline
-import numpy as np
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-###############################################################################
-# Page layout
-###############################################################################
-st.set_page_config(page_title="PDF Chat & Summarize", layout="wide")
-st.markdown("""
-<style>
-    .block-container { padding-top: 1rem; padding-bottom: 0; }
-    .stTabs [data-baseweb="tab-list"] { gap: 4px; }
-    .stTabs [data-baseweb="tab"] { padding: 8px 24px; }
-    .chat-msg { padding: 0.5rem 1rem; border-radius: 8px; margin: 0.3rem 0; }
-    .user   { background-color: #e3f2fd; margin-left: 20%; }
-    .assistant { background-color: #f1f3f4; margin-right: 20%; }
-</style>
-""", unsafe_allow_html=True)
-###############################################################################
-# Cached heavy objects
-###############################################################################
-@st.cache_resource(show_spinner=False)
-def load_embed():
-    return SentenceTransformer("all-MiniLM-L6-v2")
-@st.cache_resource(show_spinner=False)
-def load_qa():
-    return pipeline("text2text-generation", model="google/flan-t5-large", max_length=512)
-@st.cache_resource(show_spinner=False)
-def load_sum():
-    return pipeline("summarization", model="facebook/bart-large-cnn", max_length=250)
-embed = load_embed()
-qa_pipe  = load_qa()
-sum_pipe = load_sum()
-###############################################################################
-# Helpers
-###############################################################################
-def extract_pdf(uploaded_file):
-    """Return (plain text, image_list)"""
-    text = ""
-    images = []
-    with pdfplumber.open(BytesIO(uploaded_file.getbuffer())) as pdf:
-        for page in pdf.pages:
-            txt = page.extract_text_layout() or page.extract_text()
-            if not txt:
-                img = page.to_image(resolution=200).original
-                txt = pytesseract.image_to_string(img)
-            text += txt + "\n"
-            for img in page.images:
-                try:
-                    x0, y0, x1, y1 = img["x0"], img["y0"], img["x1"], img["y1"]
-                    pil = page.within_bbox((x0, y0, x1, y1)).to_image(resolution=200).original
-                    images.append(pil)
-                except Exception:
-                    pass
-    return text.strip(), images
-def build_index(text):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=80)
-    chunks = splitter.split_text(text)
-    vectors = embed.encode(chunks, show_progress_bar=False, batch_size=64)
-    index = FAISS.from_embeddings(list(zip(chunks, vectors)), embed)
-    return index
-def summarize(text):
-    if len(text) < 50:
-        return "Document too short to summarize."
-    # pick top 3k chars to stay within model limit
-    truncated = text[:3000]
-    return sum_pipe(truncated, max_length=250, min_length=60, do_sample=False)[0]["summary_text"]
-def answer(question, index):
-    if index is None:
-        return "Please upload & process a PDF first."
-    docs = index.similarity_search(question, k=4)
-    context = "\n".join([d.page_content for d in docs])
-    prompt = f"Answer the question using ONLY the context below.\n\nContext:\n{context}\n\nQuestion: {question}"
-    return qa_pipe(prompt, max_length=256, do_sample=False)[0]["generated_text"]
-###############################################################################
-# Session init
-###############################################################################
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-if "index" not in st.session_state:
-    st.session_state.index = None
-if "raw_text" not in st.session_state:
-    st.session_state.raw_text = ""
-if "images" not in st.session_state:
-    st.session_state.images = []
-###############################################################################
-# Sidebar
-###############################################################################
-with st.sidebar:
-    st.subheader("📁 Upload PDF")
-    uploaded = st.file_uploader("Choose a file", type="pdf", label_visibility="collapsed")
-    if uploaded and st.button("Process PDF"):
-        with st.spinner("Extracting text & images…"):
-            st.session_state.raw_text, st.session_state.images = extract_pdf(uploaded)
-            st.session_state.index = build_index(st.session_state.raw_text)
             st.session_state.messages = []
-            st.toast("PDF ready!")
     if st.session_state.images:
-        st.subheader("🖼️ Extracted Images")
-        for im in st.session_state.images:
-            st.image(im, use_column_width=True)
-###############################################################################
-# Main Tabs
-###############################################################################
-tab_chat, tab_sum = st.tabs(["💬 Chat", "📄 Summarize"])
-with tab_chat:
-    if st.session_state.index is None:
-        st.info("Upload & process a PDF first using the sidebar.")
-    else:
-        # history
-        for role, msg in st.session_state.messages:
-            css = "user" if role == "user" else "assistant"
-            st.markdown(f'<div class="chat-msg {css}">{msg}</div>', unsafe_allow_html=True)
-        # input
-        if question := st.chat_input("Ask anything about the PDF…"):
-            st.session_state.messages.append(("user", question))
-            st.markdown(f'<div class="chat-msg user">{question}</div>', unsafe_allow_html=True)
-            with st.spinner("Thinking…"):
-                resp = answer(question, st.session_state.index)
-            st.session_state.messages.append(("assistant", resp))
-            st.markdown(f'<div class="chat-msg assistant">{resp}</div>', unsafe_allow_html=True)
-with tab_sum:
-    if not st.session_state.raw_text:
-        st.info("Upload & process a PDF first.")
-    else:
-        if st.button("Generate Summary"):
-            with st.spinner("Summarizing…"):
-                summary = summarize(st.session_state.raw_text)
-            st.subheader("Summary")
-            st.write(summary)

 import streamlit as st
+import logging
+import os
 from io import BytesIO
+import pdfplumber
+from pdf2image import convert_from_bytes
 from PIL import Image
+from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from sentence_transformers import SentenceTransformer
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
+from datasets import load_dataset
+from rank_bm25 import BM25Okapi
+from rouge_score import rouge_scorer
+import re
+import time
+import pytesseract
+# Setup logging for Spaces
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Lazy load models
+@st.cache_resource(ttl=1800)
+def load_embeddings_model():
+    """Return the "all-MiniLM-L6-v2" SentenceTransformer, or None if loading fails.
+
+    A failure is written to the log and shown in the UI via st.error instead
+    of being raised. The result is cached by st.cache_resource (ttl=1800).
+    """
+    logger.info("Loading embeddings model")
+    model = None
+    try:
+        model = SentenceTransformer("all-MiniLM-L6-v2")
+    except Exception as exc:
+        detail = str(exc)
+        logger.error(f"Embeddings load error: {detail}")
+        st.error(f"Embedding model error: {detail}")
+    return model
+@st.cache_resource(ttl=1800)
+def load_qa_pipeline():
+    """Return the text2text QA pipeline, preferring a freshly fine-tuned model.
+
+    Tries load_and_prepare_dataset() followed by fine_tune_qa_model(); when
+    either yields nothing, falls back to stock "google/flan-t5-small".
+    Returns None (after logging and st.error) if anything raises.
+    """
+    logger.info("Loading QA pipeline")
+    try:
+        training_data = load_and_prepare_dataset()
+        tuned = fine_tune_qa_model(training_data) if training_data else None
+        if tuned:
+            return tuned
+        # No usable fine-tuned model: use the pretrained checkpoint as-is.
+        return pipeline("text2text-generation", model="google/flan-t5-small", max_length=300)
+    except Exception as exc:
+        detail = str(exc)
+        logger.error(f"QA model load error: {detail}")
+        st.error(f"QA model error: {detail}")
+        return None
+@st.cache_resource(ttl=1800)
+def load_summary_pipeline():
+    """Return the "facebook/bart-large-cnn" summarization pipeline, or None on failure.
+
+    Load errors are logged and surfaced with st.error rather than raised.
+    """
+    logger.info("Loading summary pipeline")
+    summarizer = None
+    try:
+        summarizer = pipeline("summarization", model="facebook/bart-large-cnn", max_length=250)
+    except Exception as exc:
+        detail = str(exc)
+        logger.error(f"Summary model load error: {detail}")
+        st.error(f"Summary model error: {detail}")
+    return summarizer
+# Load and prepare dataset (e.g., SQuAD)
+@st.cache_data(ttl=3600)
+def load_and_prepare_dataset(dataset_name="squad", max_samples=1000):
+    """Load a question-answering dataset and turn it into text-to-text pairs.
+
+    Args:
+        dataset_name: Hub dataset id. The preprocessing reads the columns
+            'question', 'context' and 'answers' (each answer a dict with a
+            'text' list), so only datasets with that schema can succeed.
+        max_samples: upper bound on the number of examples kept after a
+            seeded shuffle of the first 80% of the train split.
+
+    Returns:
+        A dataset with the columns 'input_text' and 'target_text', or None
+        when loading or preprocessing fails (the error is logged).
+    """
+    logger.info(f"Loading dataset: {dataset_name}")
+    try:
+        dataset = load_dataset(dataset_name, split="train[:80%]")
+        dataset = dataset.shuffle(seed=42).select(range(min(max_samples, len(dataset))))
+        def preprocess(examples):
+            inputs = [f"question: {q} context: {c}" for q, c in zip(examples['question'], examples['context'])]
+            # With batched=True, examples['answers'] is a list holding one dict
+            # per example, so 'text' has to be read per example; indexing the
+            # list with 'text' raised TypeError and made this function always
+            # return None.
+            targets = [ans['text'][0] if ans['text'] else "" for ans in examples['answers']]
+            return {'input_text': inputs, 'target_text': targets}
+        dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
+        return dataset
+    except Exception as e:
+        logger.error(f"Dataset load error: {str(e)}")
+        return None
+# Fine-tune QA model
+@st.cache_resource(ttl=3600)
+def fine_tune_qa_model(dataset):
+    """Fine-tune "google/flan-t5-small" on text-to-text pairs and return a pipeline.
+
+    Args:
+        dataset: dataset with 'input_text' and 'target_text' columns, as
+            produced by load_and_prepare_dataset().
+
+    Returns:
+        A "text2text-generation" pipeline loaded from ./fine_tuned_model, or
+        None if any step fails (the error is logged).
+
+    NOTE(review): st.cache_resource has to hash `dataset` to build the cache
+    key; confirm that works for this argument type on the deployed Streamlit
+    version.
+    """
+    logger.info("Starting fine-tuning")
+    try:
+        model_name = "google/flan-t5-small"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+        pad_id = tokenizer.pad_token_id
+        def tokenize_function(examples):
+            model_inputs = tokenizer(examples['input_text'], max_length=512, truncation=True, padding="max_length")
+            labels = tokenizer(examples['target_text'], max_length=128, truncation=True, padding="max_length")
+            # Padding positions in the labels are replaced by -100 so the loss
+            # ignores them; otherwise the short answers are drowned out by the
+            # padding up to max_length=128.
+            model_inputs["labels"] = [
+                [tok if tok != pad_id else -100 for tok in seq]
+                for seq in labels["input_ids"]
+            ]
+            return model_inputs
+        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['input_text', 'target_text'])
+        training_args = TrainingArguments(
+            output_dir="./fine_tuned_model",
+            num_train_epochs=2,
+            per_device_train_batch_size=4,
+            save_steps=500,
+            logging_steps=100,
+            evaluation_strategy="no",
+            learning_rate=3e-5,
+            fp16=False,
+        )
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_dataset,
+        )
+        trainer.train()
+        # Persist weights and tokenizer together so the pipeline below can be
+        # rebuilt from the directory alone.
+        model.save_pretrained("./fine_tuned_model")
+        tokenizer.save_pretrained("./fine_tuned_model")
+        logger.info("Fine-tuning complete")
+        return pipeline("text2text-generation", model="./fine_tuned_model", tokenizer="./fine_tuned_model", max_length=300)
+    except Exception as e:
+        logger.error(f"Fine-tuning error: {str(e)}")
+        return None
+# Augment vector store with dataset
+def augment_vector_store(vector_store, dataset_name="squad", max_samples=300):
+    """Add context/answer snippets from a QA dataset to an existing vector store.
+
+    Args:
+        vector_store: store exposing add_embeddings(); returned unchanged
+            when it is falsy.
+        dataset_name: Hub dataset id with 'context' and 'answers' columns.
+        max_samples: maximum number of train examples to add.
+
+    Returns:
+        The same vector_store object, augmented in place when possible. Any
+        failure is logged and the store is returned as it was.
+
+    NOTE(review): once this works, dataset passages are retrievable next to
+    the PDF chunks and can show up as context for PDF questions; confirm
+    that is intended.
+    """
+    logger.info(f"Augmenting vector store with dataset: {dataset_name}")
+    if not vector_store:
+        # Nothing to augment; skip the dataset download entirely.
+        return vector_store
+    try:
+        # Load first, then truncate: the original one-liner evaluated
+        # len(dataset) before `dataset` was bound, so it always raised
+        # UnboundLocalError and the augmentation silently never ran.
+        dataset = load_dataset(dataset_name, split="train")
+        dataset = dataset.select(range(min(max_samples, len(dataset))))
+        chunks = [
+            f"Context: {c}\nAnswer: {a['text'][0]}"
+            for c, a in zip(dataset['context'], dataset['answers'])
+            if a['text']  # skip examples without an answer instead of raising IndexError
+        ]
+        embeddings_model = load_embeddings_model()
+        if embeddings_model and chunks:
+            embeddings = embeddings_model.encode(chunks, batch_size=128, show_progress_bar=False)
+            vector_store.add_embeddings(zip(chunks, embeddings))
+        return vector_store
+    except Exception as e:
+        logger.error(f"Vector store augmentation error: {str(e)}")
+        return vector_store
+# Process PDF with enhanced extraction and OCR fallback
+def process_pdf(uploaded_file):
+    """Extract text, code-like snippets and OCR images from a PDF and index them.
+
+    Only the first 8 pages are read. Pages without a text layer fall back to
+    OCR via pytesseract, and the rendered page image is kept for display.
+
+    Args:
+        uploaded_file: object exposing getvalue() that returns the PDF
+            bytes, or None when nothing has been uploaded.
+
+    Returns:
+        (text_vector_store, code_vector_store, text, code_text, images).
+        The stores are None when there is nothing to index or the embedding
+        model is unavailable. On any error the result is
+        (None, None, "", "", []) and the error is logged and shown in the UI.
+    """
+    logger.info("Processing PDF with enhanced extraction")
+    if uploaded_file is None:
+        # The button can be pressed before a file is chosen; report that
+        # plainly instead of surfacing an AttributeError message.
+        st.error("PDF error: no file uploaded")
+        return None, None, "", "", []
+    try:
+        text = ""
+        code_blocks = []
+        images = []
+        with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
+            for page in pdf.pages[:8]:
+                # Extract once and normalise to "" so the regex scan below can
+                # never receive None for a page without a text layer.
+                page_text = page.extract_text(layout=False) or ""
+                extracted = page_text
+                if not extracted:
+                    try:
+                        img = page.to_image(resolution=150).original
+                        extracted = pytesseract.image_to_string(img, config='--psm 6')
+                        images.append(img)
+                    except Exception as ocr_e:
+                        logger.warning(f"OCR failed: {str(ocr_e)}")
+                if extracted:
+                    lines = extracted.split("\n")
+                    # Drop lines that are only a page marker, contain an
+                    # NNNN-NNNN number range, or start with "Copyright".
+                    cleaned_lines = [line for line in lines if not re.match(r'^\s*(Page \d+|.*\d{4}-\d{4}|Copyright.*)\s*$', line, re.I)]
+                    text += "\n".join(cleaned_lines) + "\n"
+                # NOTE(review): every monospace character becomes its own
+                # entry, so after the "\n".join below this text has one
+                # character per line; grouping by line is probably intended.
+                for char in page.chars:
+                    if 'fontname' in char and 'mono' in char['fontname'].lower():
+                        code_blocks.append(char['text'])
+                # Runs of lines indented by two or more whitespace characters
+                # are treated as code.
+                code_matches = re.finditer(r'(^\s{2,}.*?(?:\n\s{2,}.*?)*)', page_text, re.MULTILINE)
+                for match in code_matches:
+                    code_blocks.append(match.group().strip())
+                tables = page.extract_tables()
+                if tables:
+                    for table in tables:
+                        # Empty cells come back as None; render them as empty
+                        # strings rather than the literal text "None".
+                        text += "\n".join([" | ".join("" if cell is None else str(cell) for cell in row) for row in table if row]) + "\n"
+                # NOTE(review): presumably meant to pick up large-font headings,
+                # but extract_words() is called without extra_attrs, so 'size'
+                # may never be present on the word dicts - confirm.
+                for obj in page.extract_words():
+                    if obj.get('size', 0) > 12:
+                        text += f"\n{obj['text']}\n"
+        code_text = "\n".join(code_blocks).strip()
+        if not text:
+            raise ValueError("No text extracted from PDF")
+        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=250, chunk_overlap=40, keep_separator=True)
+        text_chunks = text_splitter.split_text(text)[:25]
+        code_chunks = text_splitter.split_text(code_text)[:10] if code_text else []
+        embeddings_model = load_embeddings_model()
+        if not embeddings_model:
+            return None, None, text, code_text, images
+        # Encode each chunk list in one batched call instead of one call per chunk.
+        text_vector_store = FAISS.from_embeddings(
+            zip(text_chunks, embeddings_model.encode(text_chunks, show_progress_bar=False, batch_size=128)),
+            embeddings_model.encode
+        ) if text_chunks else None
+        code_vector_store = FAISS.from_embeddings(
+            zip(code_chunks, embeddings_model.encode(code_chunks, show_progress_bar=False, batch_size=128)),
+            embeddings_model.encode
+        ) if code_chunks else None
+        if text_vector_store:
+            text_vector_store = augment_vector_store(text_vector_store)
+        logger.info("PDF processed successfully")
+        return text_vector_store, code_vector_store, text, code_text, images
+    except Exception as e:
+        logger.error(f"PDF processing error: {str(e)}")
+        st.error(f"PDF error: {str(e)}")
+        return None, None, "", "", []
+# Summarize PDF with ROUGE metrics and improved topic focus
+def summarize_pdf(text):
+    """Summarize the most relevant chunks of `text` and report a ROUGE-1 score.
+
+    The text is split into ~250-character chunks. Up to four chunks are
+    picked by combining BM25 and FAISS rankings for the fixed query
+    "main topic and key points"; each is summarized separately and the
+    results are joined and capped at 250 words.
+
+    Args:
+        text: full extracted document text.
+
+    Returns:
+        A Markdown string with the summary, its word count and the ROUGE-1
+        F-measure computed against the first 500 characters of `text`; or an
+        explanatory message when the model is unavailable or an error occurs.
+    """
+    logger.info("Generating summary")
+    try:
+        summary_pipeline = load_summary_pipeline()
+        if not summary_pipeline:
+            return "Summary model unavailable."
+        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=250, chunk_overlap=40)
+        chunks = text_splitter.split_text(text)
+        # Hybrid search for relevant chunks
+        embeddings_model = load_embeddings_model()
+        if embeddings_model and chunks:
+            # One batched encode call instead of one call per chunk.
+            chunk_vectors = embeddings_model.encode(chunks, show_progress_bar=False)
+            temp_vector_store = FAISS.from_embeddings(zip(chunks, chunk_vectors), embeddings_model.encode)
+            bm25 = BM25Okapi([chunk.split() for chunk in chunks])
+            query = "main topic and key points"
+            bm25_docs = bm25.get_top_n(query.split(), chunks, n=4)
+            faiss_docs = [doc.page_content for doc in temp_vector_store.similarity_search(query, k=4)]
+            # Interleave both rankings and de-duplicate while keeping order.
+            # list(set(...)) picked four chunks in arbitrary order, which made
+            # the summary vary between runs and ignored the rank of each hit.
+            interleaved = [doc for pair in zip(bm25_docs, faiss_docs) for doc in pair]
+            selected_chunks = list(dict.fromkeys(interleaved + bm25_docs + faiss_docs))[:4]
+        else:
+            selected_chunks = chunks[:4]
+        summaries = []
+        for chunk in selected_chunks:
+            summary = summary_pipeline(f"Summarize the main topic and key points in detail: {chunk[:250]}", max_length=100, min_length=50, do_sample=False)[0]['summary_text']
+            summaries.append(summary.strip())
+        combined_summary = " ".join(summaries)
+        if len(combined_summary.split()) > 250:
+            combined_summary = " ".join(combined_summary.split()[:250])
+        word_count = len(combined_summary.split())
+        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
+        # The reference is just the first 500 characters of the document.
+        scores = scorer.score(text[:500], combined_summary)
+        logger.info(f"ROUGE scores: {scores}")
+        return f"**Main Topic Summary** ({word_count} words):\n{combined_summary}\n\n**ROUGE-1**: {scores['rouge1'].fmeasure:.2f}"
+    except Exception as e:
+        logger.error(f"Summary error: {str(e)}")
+        return f"Oops, something went wrong summarizing: {str(e)}"
+# Answer question with hybrid search
+def answer_question(text_vector_store, code_vector_store, query):
+    """Answer `query` from the indexed PDF content.
+
+    Queries containing a code-related keyword are served from
+    code_vector_store (when present) as a code block plus an explanation.
+    All other queries retrieve ten candidates from text_vector_store,
+    re-rank them with BM25, combine both rankings into at most five context
+    chunks and pass them to the QA pipeline.
+
+    Args:
+        text_vector_store: vector store over the document text, or None.
+        code_vector_store: vector store over extracted code, or None.
+        query: the user's question.
+
+    Returns:
+        A Markdown-formatted answer, or a human-readable message when no
+        store or model is available or an error occurs.
+    """
+    logger.info(f"Processing query: {query}")
+    try:
+        if not text_vector_store and not code_vector_store:
+            return "Please upload a PDF first!"
+        qa_pipeline = load_qa_pipeline()
+        if not qa_pipeline:
+            return "Sorry, the QA model is unavailable right now."
+        is_code_query = any(keyword in query.lower() for keyword in ["code", "script", "function", "programming", "give me code", "show code"])
+        if is_code_query and code_vector_store:
+            docs = code_vector_store.similarity_search(query, k=3)
+            code = "\n".join(doc.page_content for doc in docs)
+            explanation = qa_pipeline(f"Explain this code: {code[:500]}")[0]['generated_text']
+            return f"**Code**:\n```python\n{code}\n```\n**Explanation**:\n{explanation}"
+        vector_store = text_vector_store
+        if not vector_store:
+            return "No relevant content found for your query."
+        # Hybrid search: FAISS + BM25
+        text_chunks = [doc.page_content for doc in vector_store.similarity_search(query, k=10)]
+        if not text_chunks:
+            # BM25Okapi cannot be built from an empty corpus.
+            return "No relevant content found for your query."
+        bm25 = BM25Okapi([chunk.split() for chunk in text_chunks])
+        bm25_docs = bm25.get_top_n(query.split(), text_chunks, n=5)
+        # The five best FAISS hits are the head of the ten already fetched, so
+        # a second similarity search is unnecessary.
+        faiss_docs = text_chunks[:5]
+        # Interleave both rankings and de-duplicate while keeping order.
+        # list(set(...)) returned an arbitrary subset in arbitrary order.
+        interleaved = [doc for pair in zip(bm25_docs, faiss_docs) for doc in pair]
+        combined_docs = list(dict.fromkeys(interleaved + bm25_docs + faiss_docs))[:5]
+        context = "\n".join(combined_docs)
+        prompt = f"Use the following PDF content to answer the question accurately and concisely. Avoid speculation and focus on the provided context:\n\n{context}\n\nQuestion: {query}\nAnswer:"
+        response = qa_pipeline(prompt)[0]['generated_text']
+        logger.info("Answer generated")
+        return f"**Answer**:\n{response.strip()}\n\n**Source Context**:\n{context[:500]}..."
+    except Exception as e:
+        logger.error(f"Query error: {str(e)}")
+        return f"Sorry, something went wrong: {str(e)}"
+# Streamlit UI
+try:
+    # Top-level Streamlit script: page setup, session state, sidebar actions,
+    # PDF upload/processing, chat, extracted images and chat export. Any
+    # exception raised while building the page is logged and shown to the user.
+    st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄", layout="wide")
+    st.markdown("""
+        <style>
+        .main { max-width: 900px; margin: 0 auto; padding: 20px; }
+        .sidebar { background-color: #f8f9fa; padding: 10px; border-radius: 5px; }
+        .message { margin: 10px 0; padding: 10px; border-radius: 5px; display: block; }
+        .user { background-color: #e6f3ff; }
+        .assistant { background-color: #f0f0f0; }
+        .dark .user { background-color: #2a2a72; color: #fff; }
+        .dark .assistant { background-color: #2e2e2e; color: #fff; }
+        .stButton>button { background-color: #4CAF50; color: white; border: none; padding: 8px 16px; border-radius: 5px; }
+        .stButton>button:hover { background-color: #45a049; }
+        pre { background-color: #f8f8f8; padding: 10px; border-radius: 5px; overflow-x: auto; }
+        .header { background: linear-gradient(90deg, #4CAF50, #81C784); color: white; padding: 10px; border-radius: 5px; text-align: center; }
+        .progress-bar { background-color: #e0e0e0; border-radius: 5px; height: 10px; }
+        .progress-fill { background-color: #4CAF50; height: 100%; border-radius: 5px; transition: width 0.5s ease; }
+        </style>
+    """, unsafe_allow_html=True)
+    st.markdown('<div class="header"><h1>Smart PDF Q&A</h1></div>', unsafe_allow_html=True)
+    st.markdown("Upload a PDF to ask questions, summarize (~150 words), or extract code with 'give me code'. Fast and friendly responses!")
+    # Initialize session state
+    if "messages" not in st.session_state:
+        st.session_state.messages = [{"role": "assistant", "content": "Hello! Upload a PDF and process it to start chatting."}]
+    if "text_vector_store" not in st.session_state:
+        st.session_state.text_vector_store = None
+    if "code_vector_store" not in st.session_state:
+        st.session_state.code_vector_store = None
+    if "pdf_text" not in st.session_state:
+        st.session_state.pdf_text = ""
+    if "code_text" not in st.session_state:
+        st.session_state.code_text = ""
+    if "images" not in st.session_state:
+        st.session_state.images = []
+    # Sidebar with toggle
+    with st.sidebar:
+        st.markdown('<div class="sidebar">', unsafe_allow_html=True)
+        # NOTE(review): `theme` is never read afterwards, so the ".dark" CSS
+        # rules above are not applied by anything in this script.
+        theme = st.radio("Theme", ["Light", "Dark"], index=0)
+        dataset_name = st.selectbox("Select Dataset for Fine-Tuning", ["squad", "cnn_dailymail", "bigcode/the-stack"], index=0)
+        if st.button("Fine-Tune Model"):
+            # The progress bar is cosmetic; it is not tied to the work below.
+            progress_bar = st.progress(0)
+            for i in range(100):
+                time.sleep(0.008)
+                progress_bar.progress(i + 1)
+            dataset = load_and_prepare_dataset(dataset_name=dataset_name)
+            if dataset:
+                fine_tuned_pipeline = fine_tune_qa_model(dataset)
+                if fine_tuned_pipeline:
+                    st.success("Model fine-tuned successfully!")
+                else:
+                    st.error("Fine-tuning failed.")
+            else:
+                # Previously a failed dataset load gave no feedback at all.
+                st.error("Could not load or prepare the selected dataset.")
+        if st.button("Clear Chat"):
             st.session_state.messages = []
+            # st.experimental_rerun is gone from newer Streamlit releases;
+            # prefer st.rerun and fall back only where it does not exist.
+            (getattr(st, "rerun", None) or st.experimental_rerun)()
+        if st.button("Retry Summarization") and st.session_state.pdf_text:
+            progress_bar = st.progress(0)
+            with st.spinner("Retrying summarization..."):
+                for i in range(100):
+                    time.sleep(0.008)
+                    progress_bar.progress(i + 1)
+                summary = summarize_pdf(st.session_state.pdf_text)
+                st.session_state.messages.append({"role": "assistant", "content": summary})
+                # The summary is derived from PDF text, so it is rendered as
+                # plain Markdown without raw HTML.
+                st.markdown(summary)
+        st.markdown('</div>', unsafe_allow_html=True)
+    # PDF upload and processing
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        if st.button("Process PDF"):
+            if uploaded_file is None:
+                # The button is clickable before a file has been chosen.
+                st.warning("Please upload a PDF first.")
+            else:
+                progress_bar = st.progress(0)
+                with st.spinner("Processing PDF..."):
+                    for i in range(100):
+                        time.sleep(0.02)
+                        progress_bar.progress(i + 1)
+                    st.session_state.text_vector_store, st.session_state.code_vector_store, st.session_state.pdf_text, st.session_state.code_text, st.session_state.images = process_pdf(uploaded_file)
+                    if st.session_state.text_vector_store or st.session_state.code_vector_store:
+                        st.success("PDF processed! Ask away or summarize.")
+                        st.session_state.messages = [{"role": "assistant", "content": "PDF processed! What would you like to know?"}]
+                    else:
+                        st.error("Failed to process PDF.")
+    with col2:
+        if st.button("Summarize PDF") and st.session_state.pdf_text:
+            progress_bar = st.progress(0)
+            with st.spinner("Summarizing..."):
+                for i in range(100):
+                    time.sleep(0.008)
+                    progress_bar.progress(i + 1)
+                summary = summarize_pdf(st.session_state.pdf_text)
+                # Only stored here; the history loop below renders it once.
+                st.session_state.messages.append({"role": "assistant", "content": summary})
+    # Display chat history. This runs before the new prompt is handled so the
+    # messages added in this run are not drawn twice.
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            # Message content includes PDF-derived text and user input, so raw
+            # HTML rendering stays disabled.
+            st.markdown(message["content"])
+    # Chat interface
+    if st.session_state.text_vector_store or st.session_state.code_vector_store:
+        prompt = st.chat_input("Ask a question (e.g., 'Give me code' or 'What’s the main idea?'):")
+        if prompt:
+            st.session_state.messages.append({"role": "user", "content": prompt})
+            with st.chat_message("user"):
+                st.markdown(prompt)
+            with st.chat_message("assistant"):
+                progress_bar = st.progress(0)
+                # Plain text: the spinner label does not render HTML markup.
+                with st.spinner("⏳ Processing..."):
+                    for i in range(100):
+                        time.sleep(0.004)
+                        progress_bar.progress(i + 1)
+                    answer = answer_question(st.session_state.text_vector_store, st.session_state.code_vector_store, prompt)
+                st.markdown(answer)
+            st.session_state.messages.append({"role": "assistant", "content": answer})
+    # Display extracted images
     if st.session_state.images:
+        st.header("Extracted Images")
+        for img in st.session_state.images:
+            st.image(img, caption="Extracted PDF Image", use_column_width=True)
+    # Download chat history
+    if st.session_state.messages:
+        chat_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in st.session_state.messages)
+        st.download_button("Download Chat History", chat_text, "chat_history.txt")
+except Exception as e:
+    logger.error(f"App initialization failed: {str(e)}")
+    st.error(f"App failed to start: {str(e)}. Check Spaces logs or contact support.")