Spaces:

abid-ai
/

AIstudyAssistant

Sleeping

App Files Files Community

abid-ai commited on Dec 18, 2025

Commit

c1e9823

verified ·

1 Parent(s): 66cac5c

Update app.py

Browse files

Files changed (1) hide show

app.py +388 -375

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 """
 AI Study Assistant - Streamlit Application
 Features:
@@ -14,11 +15,23 @@ import os
 import io
 import time
 import base64
 from typing import List, Tuple, Dict, Optional
-import PyPDF2
 import streamlit as st
 # Streamlit page config
 st.set_page_config(page_title="AI Study Assistant", layout="wide", initial_sidebar_state="expanded")
@@ -27,52 +40,52 @@ st.set_page_config(page_title="AI Study Assistant", layout="wide", initial_sideb
 # CSS / Fonts (handwriting)
 # -------------------------
 HANDWRITING_FONTS = [
-   "Patrick Hand",
-   "Caveat",
-   "Indie Flower",
-   "Reenie Beanie"
 ]
 google_fonts = "+".join([f"{f.replace(' ', '+')}:wght@400;700" for f in HANDWRITING_FONTS])
 st.markdown(
-   f"""<link href="https://fonts.googleapis.com/css2?family=Patrick+Hand&family=Caveat&family=Indie+Flower&family=Reenie+Beanie&display=swap" rel="stylesheet">""",
-   unsafe_allow_html=True
 )
 st.markdown(
-   f"""
-   <style>
-     :root {{
-       --handwriting: "{HANDWRITING_FONTS[0]}", "{HANDWRITING_FONTS[1]}", cursive, sans-serif;
-     }}
-     body {{
-       background: linear-gradient(180deg,#fbfbff,#ffffff);
-     }}
-     .handwriting {{
-       font-family: var(--handwriting);
-     }}
-     .mcq-block {{
-       white-space: pre-wrap;
-       font-family: var(--handwriting);
-       padding: 12px;
-       border-radius: 8px;
-       background: #fffdf7;
-       border: 1px solid #f1e6d6;
-     }}
-     .qa-box {{
-       background: #ffffff;
-       border-radius: 8px;
-       padding: 10px;
-       box-shadow: 0 2px 8px rgba(12,12,12,0.05);
-     }}
-     .small-muted {{
-       font-size:12px;color:#6b7280;
-     }}
-     .download-link {{
-       margin-top: 8px;
-     }}
-   </style>
-   """,
-   unsafe_allow_html=True
 )
 # -------------------------
@@ -83,9 +96,9 @@ st.sidebar.title("AI Study Assistant — Settings")
 # API Key input (secure)
 openai_key = st.sidebar.text_input("OpenAI API Key (start with sk-)", type="password", help="Your OpenAI API key. For Spaces add it to Secrets.")
 if openai_key:
-   os.environ["OPENAI_API_KEY"] = openai_key
 elif "OPENAI_API_KEY" in os.environ:
-   openai_key = os.environ.get("OPENAI_API_KEY")
 # Model selection
 model_choice = st.sidebar.selectbox("Generation model", options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"], index=0)
@@ -106,194 +119,194 @@ st.sidebar.markdown("**Tips:** Use PDFs with selectable text for best results. S
 # OpenAI initialization
 # -------------------------
 def ensure_openai_key():
-   key = os.environ.get("OPENAI_API_KEY", None)
-   if not key:
-       raise RuntimeError("OpenAI API key not found. Set it in the sidebar or add OPENAI_API_KEY to environment.")
-   openai.api_key = key
 # -------------------------
 # PDF extraction utilities
 # -------------------------
 @st.cache_data(show_spinner=False)
 def extract_text_pdfplumber(file_bytes: bytes) -> str:
-   """Extract text using pdfplumber (best for most PDFs). Cached to avoid repeated work."""
-   text_pages = []
-   try:
-       with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
-           for p in pdf.pages:
-               txt = p.extract_text()
-               if txt:
-                   text_pages.append(txt)
-   except Exception as e:
-       # Let caller fallback to PyPDF2
-       raise e
-   return "\n\n".join(text_pages).strip()
 @st.cache_data(show_spinner=False)
 def extract_text_pypdf2(file_bytes: bytes) -> str:
-   """Fallback extraction using PyPDF2."""
-   text_pages = []
-   try:
-       reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
-       for page in reader.pages:
-           try:
-               txt = page.extract_text()
-           except Exception:
-               txt = None
-           if txt:
-               text_pages.append(txt)
-   except Exception as e:
-       raise e
-   return "\n\n".join(text_pages).strip()
 def extract_text(file_bytes: bytes) -> str:
-   """Robust extraction: try pdfplumber first, fallback to PyPDF2."""
-   text = ""
-   try:
-       text = extract_text_pdfplumber(file_bytes)
-       if not text:
-           raise ValueError("pdfplumber returned empty text.")
-   except Exception:
-       text = extract_text_pypdf2(file_bytes)
-   return text
 # -------------------------
 # Chunking / embeddings / retrieval
 # -------------------------
 @st.cache_data(show_spinner=False)
 def chunk_text(text: str, words_per_chunk: int = 700, overlap: int = 150) -> List[str]:
-   words = text.split()
-   chunks = []
-   start = 0
-   L = len(words)
-   while start < L:
-       end = min(start + words_per_chunk, L)
-       chunk = " ".join(words[start:end])
-       chunks.append(chunk)
-       start = end - overlap
-       if start < 0:
-           start = 0
-   return chunks
 @st.cache_data(show_spinner=False)
 def get_embeddings(texts: List[str], model: str) -> List[List[float]]:
-   ensure_openai_key()
-   # Batch call to embeddings API
-   resp = openai.Embedding.create(model=model, input=texts)
-   embeddings = [row["embedding"] for row in resp["data"]]
-   return embeddings
 def top_k_chunks(question: str, chunks: List[str], chunk_embs: List[List[float]], k: int = 4, emb_model: str = "text-embedding-3-small"):
-   ensure_openai_key()
-   # compute question embedding
-   q_emb = get_embeddings([question], model=emb_model)[0]
-   sims = cosine_similarity([q_emb], chunk_embs)[0]
-   idx = np.argsort(sims)[-k:][::-1]
-   selected = [chunks[i] for i in idx]
-   return selected, idx
 # -------------------------
 # OpenAI Chat wrappers
 # -------------------------
 def call_chat_completion(messages: List[Dict], model: str = "gpt-3.5-turbo", max_tokens: int = 700, temperature: float = 0.2):
-   ensure_openai_key()
-   try:
-       resp = openai.ChatCompletion.create(
-           model=model,
-           messages=messages,
-           max_tokens=max_tokens,
-           temperature=temperature
-       )
-       return resp["choices"][0]["message"]["content"].strip()
-   except openai.error.OpenAIError as e:
-       raise RuntimeError(f"OpenAI API error: {e}")
 # -------------------------
 # Prompt engineering functions
 # -------------------------
 def generate_summary(full_text: str, model: str = "gpt-4") -> str:
-   """
-   Create a concise but comprehensive summary with headings and key bullets.
-   To reduce tokens we can ask the model to summarize sections first (but here we send full text).
-   """
-   prompt = [
-       {
-           "role": "system",
-           "content": "You are an assistant that summarizes documents for study and revision."
-       },
-       {
-           "role": "user",
-           "content": (
-               "Summarize the following document for exam revision. "
-               "Provide a concise executive summary (3-6 sentences), then key takeaways as bullet points, and a short list of important terms and definitions. "
-               "Use clear headings. Keep the style formal and compact.\n\n"
-               f"Document:\n\n{full_text}"
-           )
-       }
-   ]
-   # Limit tokens to protect cost; large docs may need chunked summarization — user can call again if needed
-   return call_chat_completion(prompt, model=model, max_tokens=900, temperature=0.2)
 def generate_mcqs(full_text: str, model: str = "gpt-4", count: int = 30) -> str:
-   """
-   Generate MCQs formatted consistently. We ask the model to return plaintext in a structured format.
-   """
-   instruction = (
-       f"Create {count} multiple-choice questions (MCQs) based on the document below. "
-       "Each question must have 4 options labeled A, B, C, D and one correct answer. "
-       "Make questions diverse (recall, concept, application). Mark the correct answer on a separate 'Answer:' line. "
-       "Format EXACTLY like this for each question:\n\n"
-       "Question <n>: <question text>\n\n"
-       "    A. <option A>\n"
-       "    B. <option B>\n"
-       "    C. <option C>\n"
-       "    D. <option D>\n\n"
-       "Answer: <LETTER>\n\n"
-       "Do NOT include explanations. Keep each question short and clear."
-   )
-   prompt = [
-       {"role": "system", "content": "You are an experienced instructor who writes high-quality MCQs."},
-       {"role": "user", "content": instruction + "\n\nDocument:\n\n" + full_text}
-   ]
-   return call_chat_completion(prompt, model=model, max_tokens=2200, temperature=0.3)
 def answer_question(question: str, chunks: List[str], chunk_embs: List[List[float]], emb_model: str, gen_model: str, top_k: int = 4) -> str:
-   """
-   Retrieval-augmented answer: pick top_k chunks and ask model to answer using only that context.
-   """
-   selected_chunks, idx = top_k_chunks(question, chunks, chunk_embs, k=top_k, emb_model=emb_model)
-   context = "\n\n---\n\n".join(selected_chunks)
-   prompt = [
-       {"role": "system", "content": "You are an assistant that answers questions using the provided context. If the answer is not in the context, say you could not find it."},
-       {"role": "user", "content": f"Context:\n\n{context}\n\nQuestion: {question}\n\nAnswer concisely and cite which chunk indexes (0-based) you used."}
-   ]
-   return call_chat_completion(prompt, model=gen_model, max_tokens=400, temperature=0.2)
 # -------------------------
 # Download helpers
 # -------------------------
 def make_text_download(content: str, filename: str = "study_package.md"):
-   b64 = base64.b64encode(content.encode()).decode()
-   href = f'<a class="download-link" href="data:text/markdown;base64,{b64}" download="{filename}">Download {filename}</a>'
-   return href
 # -------------------------
 # Session state initialization
 # -------------------------
 if "qa_history" not in st.session_state:
-   st.session_state["qa_history"] = []  # list of dicts: question, answer, time
 if "summary" not in st.session_state:
-   st.session_state["summary"] = None
 if "mcq_text" not in st.session_state:
-   st.session_state["mcq_text"] = None
 if "chunks" not in st.session_state:
-   st.session_state["chunks"] = None
 if "chunk_embeddings" not in st.session_state:
-   st.session_state["chunk_embeddings"] = None
 # -------------------------
 # App UI layout
@@ -305,205 +318,205 @@ st.caption("Upload a PDF and generate a summary, 25+ MCQs, and interactively ask
 left_col, right_col = st.columns([1.4, 2])
 with left_col:
-   st.header("Upload & Settings")
-   uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"], help="Choose a PDF with selectable text for best results.")
-   if uploaded_file:
-       # Read bytes
-       file_bytes = uploaded_file.read()
-       st.write(f"**Filename:** {uploaded_file.name} — {len(file_bytes)//1024} KB")
-       # Try extracting text
-       with st.spinner("Extracting text from PDF..."):
-           try:
-               full_text = extract_text(file_bytes)
-               if not full_text or len(full_text.strip()) < 50:
-                   st.warning("Extracted text is short or empty. The PDF may be scanned images. Try another PDF or enable OCR.")
-               else:
-                   st.success(f"Extracted {len(full_text.split())} words from PDF.")
-                   # Save in session
-                   st.session_state["full_text"] = full_text
-           except Exception as e:
-               st.error(f"Failed to extract text: {e}")
-               st.stop()
-   else:
-       st.info("Please upload a PDF to enable summary and MCQ generation.")
-   # Action buttons
-   st.markdown("---")
-   st.header("Generate Content")
-   colA, colB = st.columns([1,1])
-   with colA:
-       if st.button("Generate Summary"):
-           if not uploaded_file:
-               st.error("Upload a PDF first.")
-           else:
-               try:
-                   with st.spinner("Generating summary (OpenAI)..."):
-                       ensure_openai_key()
-                       # If document is very large, you might want to chunk and summarize iteratively.
-                       summary_text = generate_summary(st.session_state["full_text"], model=model_choice)
-                       st.session_state["summary"] = summary_text
-                       st.success("Summary generated.")
-               except Exception as e:
-                   st.error(f"Summary generation failed: {e}")
-   with colB:
-       if st.button(f"Generate {mcq_target} MCQs"):
-           if not uploaded_file:
-               st.error("Upload a PDF first.")
-           else:
-               try:
-                   with st.spinner("Generating MCQs (this may take a moment)..."):
-                       ensure_openai_key()
-                       mcq_text = generate_mcqs(st.session_state["full_text"], model=model_choice, count=int(mcq_target))
-                       st.session_state["mcq_text"] = mcq_text
-                       st.success("MCQs generated.")
-               except Exception as e:
-                   st.error(f"MCQ generation failed: {e}")
-   # Generate both
-   if st.button("Generate Summary + MCQs"):
-       if not uploaded_file:
-           st.error("Upload a PDF first.")
-       else:
-           try:
-               with st.spinner("Generating summary + MCQs..."):
-                   ensure_openai_key()
-                   st.session_state["summary"] = generate_summary(st.session_state["full_text"], model=model_choice)
-                   st.session_state["mcq_text"] = generate_mcqs(st.session_state["full_text"], model=model_choice, count=int(mcq_target))
-                   st.success("Summary and MCQs generated.")
-           except Exception as e:
-               st.error(f"Combined generation failed: {e}")
-   # Prepare retrieval infrastructure
-   if uploaded_file and ("full_text" in st.session_state):
-       if st.button("Prepare Q&A (create embeddings)"):
-           try:
-               with st.spinner("Chunking document and computing embeddings (costly operation)..."):
-                   chunks = chunk_text(st.session_state["full_text"], words_per_chunk=int(chunk_size), overlap=int(chunk_overlap))
-                   st.session_state["chunks"] = chunks
-                   # Compute embeddings (cached)
-                   chunk_embs = get_embeddings(chunks, model=emb_model_choice)
-                   st.session_state["chunk_embeddings"] = chunk_embs
-                   st.success(f"Prepared {len(chunks)} chunks and embeddings for retrieval.")
-           except Exception as e:
-               st.error(f"Failed to prepare embeddings: {e}")
-   st.markdown("---")
-   st.header("Download / Export")
-   st.markdown("After generating content, download a combined study package.")
-   if st.session_state.get("summary") or st.session_state.get("mcq_text") or st.session_state["qa_history"]:
-       # Compose markdown
-       composed = []
-       if st.session_state.get("summary"):
-           composed.append("# Summary\n\n" + st.session_state["summary"] + "\n\n")
-       if st.session_state.get("mcq_text"):
-           composed.append("# MCQs\n\n" + st.session_state["mcq_text"] + "\n\n")
-       if st.session_state.get("qa_history"):
-           qalist = ["# Q&A History\n"]
-           for qa in st.session_state["qa_history"]:
-               qalist.append(f"**Q:** {qa['question']}\n\n**A:** {qa['answer']}\n\n_Time:_ {qa['time']}\n\n")
-           composed.append("\n".join(qalist))
-       package_md = "\n".join(composed)
-       st.markdown(make_text_download(package_md, filename=f"{uploaded_file.name}_study_package.md"), unsafe_allow_html=True)
-       st.download_button("Download study package (.md)", package_md, file_name=f"{uploaded_file.name}_study_package.md", mime="text/markdown")
-   else:
-       st.info("No generated content yet. Run summary/MCQ generation first.")
 with right_col:
-   # Tabs: Summary, MCQ Quiz, Q&A
-   tab1, tab2, tab3 = st.tabs(["📑 Summary", "📝 MCQ Quiz", "❓ Q&A Dashboard"])
-   with tab1:
-       st.header("Document Summary")
-       if st.session_state.get("summary"):
-           st.markdown("<div class='qa-box handwriting'>", unsafe_allow_html=True)
-           st.markdown(st.session_state["summary"], unsafe_allow_html=True)
-           st.markdown("</div>", unsafe_allow_html=True)
-       else:
-           st.info("No summary yet. Click 'Generate Summary' in the left panel.")
-   with tab2:
-       st.header("Generated MCQs")
-       if st.session_state.get("mcq_text"):
-           # Display with formatting: question line and indented options vertically
-           st.markdown("<div class='mcq-block'>", unsafe_allow_html=True)
-           # We display as preformatted but with handwriting font and indentation
-           st.text_area("MCQs (read-only)", value=st.session_state["mcq_text"], height=420, key="mcq_display")
-           st.markdown("</div>", unsafe_allow_html=True)
-           # Also provide CSV download parsed
-           def parse_mcqs_to_df(mcq_text: str) -> pd.DataFrame:
-               lines = mcq_text.splitlines()
-               rows = []
-               q_text = None
-               opts = {"A":"","B":"","C":"","D":""}
-               answer = ""
-               for ln in lines:
-                   if not ln.strip():
-                       continue
-                   # Question detection: starts with "Question" or "Q"
-                   if ln.strip().lower().startswith("question"):
-                       if q_text:
-                           rows.append({"question": q_text.strip(), "A": opts["A"].strip(), "B": opts["B"].strip(), "C": opts["C"].strip(), "D": opts["D"].strip(), "answer": answer.strip()})
-                       # reset
-                       parts = ln.split(":",1)
-                       if len(parts) > 1:
-                           q_text = parts[1].strip()
-                       else:
-                           q_text = ln.strip()
-                       opts = {"A":"","B":"","C":"","D":""}
-                       answer = ""
-                   elif ln.strip().startswith("A.") or ln.strip().startswith("A)"):
-                       opts["A"] = ln.strip()[2:].strip()
-                   elif ln.strip().startswith("B.") or ln.strip().startswith("B)"):
-                       opts["B"] = ln.strip()[2:].strip()
-                   elif ln.strip().startswith("C.") or ln.strip().startswith("C)"):
-                       opts["C"] = ln.strip()[2:].strip()
-                   elif ln.strip().startswith("D.") or ln.strip().startswith("D)"):
-                       opts["D"] = ln.strip()[2:].strip()
-                   elif ln.strip().lower().startswith("answer"):
-                       parts = ln.split(":",1)
-                       if len(parts) > 1:
-                           answer = parts[1].strip()
-               if q_text:
-                   rows.append({"question": q_text.strip(), "A": opts["A"].strip(), "B": opts["B"].strip(), "C": opts["C"].strip(), "D": opts["D"].strip(), "answer": answer.strip()})
-               return pd.DataFrame(rows)
-           df_mcq = parse_mcqs_to_df(st.session_state["mcq_text"])
-           if not df_mcq.empty:
-               st.download_button("Download MCQs as CSV", df_mcq.to_csv(index=False), file_name=f"{uploaded_file.name}_mcqs.csv", mime="text/csv")
-       else:
-           st.info("No MCQs generated yet. Click 'Generate MCQs' in the left panel.")
-   with tab3:
-       st.header("Q&A Dashboard")
-       st.markdown("Ask questions about the PDF. Use 'Prepare Q&A' first (computes embeddings).")
-       question_input = st.text_input("Enter your question here:")
-       if st.button("Ask question"):
-           if not st.session_state.get("chunks") or not st.session_state.get("chunk_embeddings"):
-               st.warning("Please click 'Prepare Q&A (create embeddings)' in the left panel first.")
-           elif not question_input.strip():
-               st.error("Please type a question.")
-           else:
-               try:
-                   with st.spinner("Retrieving context and generating answer..."):
-                       ans = answer_question(question_input, st.session_state["chunks"], st.session_state["chunk_embeddings"], emb_model_choice, model_choice, top_k=int(retrieval_k))
-                       timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
-                       st.session_state["qa_history"].append({"question": question_input, "answer": ans, "time": timestamp})
-                       st.success("Answer generated.")
-               except Exception as e:
-                   st.error(f"Q&A failed: {e}")
-       # Show history
-       if st.session_state["qa_history"]:
-           st.markdown("### Recent Q&A")
-           for qa in reversed(st.session_state["qa_history"][-8:]):
-               st.markdown(f"<div class='qa-box'><strong>Q:</strong> {qa['question']}<br/><strong>A:</strong> {qa['answer']}<div class='small-muted'>Time: {qa['time']}</div></div>", unsafe_allow_html=True)
-           # Download Q&A
-           qa_md = "\n\n".join([f"Q: {qa['question']}\nA: {qa['answer']}\nTime: {qa['time']}" for qa in st.session_state["qa_history"]])
-           st.download_button("Download Q&A history (.txt)", qa_md, file_name=f"{uploaded_file.name}_qa_history.txt", mime="text/plain")
-       else:
-           st.info("No Q&A history yet.")
 # -------------------------
 # Footer

+# app.py
 """
 AI Study Assistant - Streamlit Application
 Features:
 import io
 import time
 import base64
+import openai
+#import pypdf2
+from PyPDF2 import PdfReader
+import pdfplumber
+import dotenv # Corrected from python-dotenv
 from typing import List, Tuple, Dict, Optional
 import streamlit as st
+import pdfplumber
+import pandas as pd
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from dotenv import load_dotenv
+import openai
+# Load .env if present (local dev)
+load_dotenv()
 # Streamlit page config
 st.set_page_config(page_title="AI Study Assistant", layout="wide", initial_sidebar_state="expanded")
 # CSS / Fonts (handwriting)
 # -------------------------
 HANDWRITING_FONTS = [
+    "Patrick Hand",
+    "Caveat",
+    "Indie Flower",
+    "Reenie Beanie"
 ]
 google_fonts = "+".join([f"{f.replace(' ', '+')}:wght@400;700" for f in HANDWRITING_FONTS])
 st.markdown(
+    f"<link href=\"https://fonts.googleapis.com/css2?family=Patrick+Hand&family=Caveat&family=Indie+Flower&family=Reenie+Beanie&display=swap\" rel=\"stylesheet\">",
+    unsafe_allow_html=True
 )
 st.markdown(
+    f"""
+    <style>
+      :root {{
+        --handwriting: "{HANDWRITING_FONTS[0]}", "{HANDWRITING_FONTS[1]}", cursive, sans-serif;
+      }}
+      body {{
+        background: linear-gradient(180deg,#fbfbff,#ffffff);
+      }}
+      .handwriting {{
+        font-family: var(--handwriting);
+      }}
+      .mcq-block {{
+        white-space: pre-wrap;
+        font-family: var(--handwriting);
+        padding: 12px;
+        border-radius: 8px;
+        background: #fffdf7;
+        border: 1px solid #f1e6d6;
+      }}
+      .qa-box {{
+        background: #ffffff;
+        border-radius: 8px;
+        padding: 10px;
+        box-shadow: 0 2px 8px rgba(12,12,12,0.05);
+      }}
+      .small-muted {{
+        font-size:12px;color:#6b7280;
+      }}
+      .download-link {{
+        margin-top: 8px;
+      }}
+    </style>
+    """,
+    unsafe_allow_html=True
 )
 # -------------------------
 # API Key input (secure)
 openai_key = st.sidebar.text_input("OpenAI API Key (start with sk-)", type="password", help="Your OpenAI API key. For Spaces add it to Secrets.")
 if openai_key:
+    os.environ["OPENAI_API_KEY"] = openai_key
 elif "OPENAI_API_KEY" in os.environ:
+    openai_key = os.environ.get("OPENAI_API_KEY")
 # Model selection
 model_choice = st.sidebar.selectbox("Generation model", options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"], index=0)
 # OpenAI initialization
 # -------------------------
 def ensure_openai_key():
+    key = os.environ.get("OPENAI_API_KEY", None)
+    if not key:
+        raise RuntimeError("OpenAI API key not found. Set it in the sidebar or add OPENAI_API_KEY to environment.")
+    openai.api_key = key
 # -------------------------
 # PDF extraction utilities
 # -------------------------
 @st.cache_data(show_spinner=False)
 def extract_text_pdfplumber(file_bytes: bytes) -> str:
+    """Extract text using pdfplumber (best for most PDFs). Cached to avoid repeated work."""
+    text_pages = []
+    try:
+        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+            for p in pdf.pages:
+                txt = p.extract_text()
+                if txt:
+                    text_pages.append(txt)
+    except Exception as e:
+        # Let caller fallback to PyPDF2
+        raise e
+    return "\n\n".join(text_pages).strip()
 @st.cache_data(show_spinner=False)
 def extract_text_pypdf2(file_bytes: bytes) -> str:
+    """Fallback extraction using PyPDF2."""
+    text_pages = []
+    try:
+        reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
+        for page in reader.pages:
+            try:
+                txt = page.extract_text()
+            except Exception:
+                txt = None
+            if txt:
+                text_pages.append(txt)
+    except Exception as e:
+        raise e
+    return "\n\n".join(text_pages).strip()
 def extract_text(file_bytes: bytes) -> str:
+    """Robust extraction: try pdfplumber first, fallback to PyPDF2."""
+    text = ""
+    try:
+        text = extract_text_pdfplumber(file_bytes)
+        if not text:
+            raise ValueError("pdfplumber returned empty text.")
+    except Exception:
+        text = extract_text_pypdf2(file_bytes)
+    return text
 # -------------------------
 # Chunking / embeddings / retrieval
 # -------------------------
 @st.cache_data(show_spinner=False)
 def chunk_text(text: str, words_per_chunk: int = 700, overlap: int = 150) -> List[str]:
+    words = text.split()
+    chunks = []
+    start = 0
+    L = len(words)
+    while start < L:
+        end = min(start + words_per_chunk, L)
+        chunk = " ".join(words[start:end])
+        chunks.append(chunk)
+        start = end - overlap
+        if start < 0:
+            start = 0
+    return chunks
 @st.cache_data(show_spinner=False)
 def get_embeddings(texts: List[str], model: str) -> List[List[float]]:
+    ensure_openai_key()
+    # Batch call to embeddings API
+    resp = openai.Embedding.create(model=model, input=texts)
+    embeddings = [row["embedding"] for row in resp["data"]]
+    return embeddings
 def top_k_chunks(question: str, chunks: List[str], chunk_embs: List[List[float]], k: int = 4, emb_model: str = "text-embedding-3-small"):
+    ensure_openai_key()
+    # compute question embedding
+    q_emb = get_embeddings([question], model=emb_model)[0]
+    sims = cosine_similarity([q_emb], chunk_embs)[0]
+    idx = np.argsort(sims)[-k:][::-1]
+    selected = [chunks[i] for i in idx]
+    return selected, idx
 # -------------------------
 # OpenAI Chat wrappers
 # -------------------------
 def call_chat_completion(messages: List[Dict], model: str = "gpt-3.5-turbo", max_tokens: int = 700, temperature: float = 0.2):
+    ensure_openai_key()
+    try:
+        resp = openai.ChatCompletion.create(
+            model=model,
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        return resp["choices"][0]["message"]["content"].strip()
+    except openai.error.OpenAIError as e:
+        raise RuntimeError(f"OpenAI API error: {e}")
 # -------------------------
 # Prompt engineering functions
 # -------------------------
 def generate_summary(full_text: str, model: str = "gpt-4") -> str:
+    """
+    Create a concise but comprehensive summary with headings and key bullets.
+    To reduce tokens we can ask the model to summarize sections first (but here we send full text).
+    """
+    prompt = [
+        {
+            "role": "system",
+            "content": "You are an assistant that summarizes documents for study and revision."
+        },
+        {
+            "role": "user",
+            "content": (
+                "Summarize the following document for exam revision. "
+                "Provide a concise executive summary (3-6 sentences), then key takeaways as bullet points, and a short list of important terms and definitions. "
+                "Use clear headings. Keep the style formal and compact.\n\n"
+                f"Document:\n\n{full_text}"
+            )
+        }
+    ]
+    # Limit tokens to protect cost; large docs may need chunked summarization — user can call again if needed
+    return call_chat_completion(prompt, model=model, max_tokens=900, temperature=0.2)
 def generate_mcqs(full_text: str, model: str = "gpt-4", count: int = 30) -> str:
+    """
+    Generate MCQs formatted consistently. We ask the model to return plaintext in a structured format.
+    """
+    instruction = (
+        f"Create {count} multiple-choice questions (MCQs) based on the document below. "
+        "Each question must have 4 options labeled A, B, C, D and one correct answer. "
+        "Make questions diverse (recall, concept, application). Mark the correct answer on a separate 'Answer:' line. "
+        "Format EXACTLY like this for each question:\n\n"
+        "Question <n>: <question text>\n\n"
+        "    A. <option A>\n"
+        "    B. <option B>\n"
+        "    C. <option C>\n"
+        "D. <option D>\n\n"
+        "Answer: <LETTER>\n\n"
+        "Do NOT include explanations. Keep each question short and clear."
+    )
+    prompt = [
+        {"role": "system", "content": "You are an experienced instructor who writes high-quality MCQs."},
+        {"role": "user", "content": instruction + "\n\nDocument:\n\n" + full_text}
+    ]
+    return call_chat_completion(prompt, model=model, max_tokens=2200, temperature=0.3)
 def answer_question(question: str, chunks: List[str], chunk_embs: List[List[float]], emb_model: str, gen_model: str, top_k: int = 4) -> str:
+    """
+    Retrieval-augmented answer: pick top_k chunks and ask model to answer using only that context.
+    """
+    selected_chunks, idx = top_k_chunks(question, chunks, chunk_embs, k=top_k, emb_model=emb_model)
+    context = "\n\n---\n\n".join(selected_chunks)
+    prompt = [
+        {"role": "system", "content": "You are an assistant that answers questions using the provided context. If the answer is not in the context, say you could not find it."},
+        {"role": "user", "content": f"Context:\n\n{context}\n\nQuestion: {question}\n\nAnswer concisely and cite which chunk indexes (0-based) you used."}
+    ]
+    return call_chat_completion(prompt, model=gen_model, max_tokens=400, temperature=0.2)
 # -------------------------
 # Download helpers
 # -------------------------
 def make_text_download(content: str, filename: str = "study_package.md"):
+    b64 = base64.b64encode(content.encode()).decode()
+    href = f'<a class="download-link" href="data:text/markdown;base64,{b64}" download="{filename}">Download {filename}</a>'
+    return href
 # -------------------------
 # Session state initialization
 # -------------------------
 if "qa_history" not in st.session_state:
+    st.session_state["qa_history"] = []  # list of dicts: question, answer, time
 if "summary" not in st.session_state:
+    st.session_state["summary"] = None
 if "mcq_text" not in st.session_state:
+    st.session_state["mcq_text"] = None
 if "chunks" not in st.session_state:
+    st.session_state["chunks"] = None
 if "chunk_embeddings" not in st.session_state:
+    st.session_state["chunk_embeddings"] = None
 # -------------------------
 # App UI layout
 left_col, right_col = st.columns([1.4, 2])
 with left_col:
+    st.header("Upload & Settings")
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"], help="Choose a PDF with selectable text for best results.")
+    if uploaded_file:
+        # Read bytes
+        file_bytes = uploaded_file.read()
+        st.write(f"**Filename:** {uploaded_file.name} — {len(file_bytes)//1024} KB")
+        # Try extracting text
+        with st.spinner("Extracting text from PDF..."):
+            try:
+                full_text = extract_text(file_bytes)
+                if not full_text or len(full_text.strip()) < 50:
+                    st.warning("Extracted text is short or empty. The PDF may be scanned images. Try another PDF or enable OCR.")
+                else:
+                    st.success(f"Extracted {len(full_text.split())} words from PDF.")
+                    # Save in session
+                    st.session_state["full_text"] = full_text
+            except Exception as e:
+                st.error(f"Failed to extract text: {e}")
+                st.stop()
+    else:
+        st.info("Please upload a PDF to enable summary and MCQ generation.")
+    # Action buttons
+    st.markdown("---")
+    st.header("Generate Content")
+    colA, colB = st.columns([1,1])
+    with colA:
+        if st.button("Generate Summary"):
+            if not uploaded_file:
+                st.error("Upload a PDF first.")
+            else:
+                try:
+                    with st.spinner("Generating summary (OpenAI)..."):
+                        ensure_openai_key()
+                        # If document is very large, you might want to chunk and summarize iteratively.
+                        summary_text = generate_summary(st.session_state["full_text"], model=model_choice)
+                        st.session_state["summary"] = summary_text
+                        st.success("Summary generated.")
+                except Exception as e:
+                    st.error(f"Summary generation failed: {e}")
+    with colB:
+        if st.button(f"Generate {mcq_target} MCQs"):
+            if not uploaded_file:
+                st.error("Upload a PDF first.")
+            else:
+                try:
+                    with st.spinner("Generating MCQs (this may take a moment)..."):
+                        ensure_openai_key()
+                        mcq_text = generate_mcqs(st.session_state["full_text"], model=model_choice, count=int(mcq_target))
+                        st.session_state["mcq_text"] = mcq_text
+                        st.success("MCQs generated.")
+                except Exception as e:
+                    st.error(f"MCQ generation failed: {e}")
+    # Generate both
+    if st.button("Generate Summary + MCQs"):
+        if not uploaded_file:
+            st.error("Upload a PDF first.")
+        else:
+            try:
+                with st.spinner("Generating summary + MCQs..."):
+                    ensure_openai_key()
+                    st.session_state["summary"] = generate_summary(st.session_state["full_text"], model=model_choice)
+                    st.session_state["mcq_text"] = generate_mcqs(st.session_state["full_text"], model=model_choice, count=int(mcq_target))
+                    st.success("Summary and MCQs generated.")
+            except Exception as e:
+                st.error(f"Combined generation failed: {e}")
+    # Prepare retrieval infrastructure
+    if uploaded_file and ("full_text" in st.session_state):
+        if st.button("Prepare Q&A (create embeddings)"):
+            try:
+                with st.spinner("Chunking document and computing embeddings (costly operation)..."):
+                    chunks = chunk_text(st.session_state["full_text"], words_per_chunk=int(chunk_size), overlap=int(chunk_overlap))
+                    st.session_state["chunks"] = chunks
+                    # Compute embeddings (cached)
+                    chunk_embs = get_embeddings(chunks, model=emb_model_choice)
+                    st.session_state["chunk_embeddings"] = chunk_embs
+                    st.success(f"Prepared {len(chunks)} chunks and embeddings for retrieval.")
+            except Exception as e:
+                st.error(f"Failed to prepare embeddings: {e}")
+    st.markdown("---")
+    st.header("Download / Export")
+    st.markdown("After generating content, download a combined study package.")
+    if st.session_state.get("summary") or st.session_state.get("mcq_text") or st.session_state["qa_history"]:
+        # Compose markdown
+        composed = []
+        if st.session_state.get("summary"):
+            composed.append("# Summary\n\n" + st.session_state["summary"] + "\n\n")
+        if st.session_state.get("mcq_text"):
+            composed.append("# MCQs\n\n" + st.session_state["mcq_text"] + "\n\n")
+        if st.session_state.get("qa_history"):
+            qalist = ["# Q&A History\n"]
+            for qa in st.session_state["qa_history"]:
+                qalist.append(f"**Q:** {qa['question']}\n\n**A:** {qa['answer']}\n\n_Time:_ {qa['time']}\n\n")
+            composed.append("\n".join(qalist))
+        package_md = "\n".join(composed)
+        st.markdown(make_text_download(package_md, filename=f"{uploaded_file.name}_study_package.md"), unsafe_allow_html=True)
+        st.download_button("Download study package (.md)", package_md, file_name=f"{uploaded_file.name}_study_package.md", mime="text/markdown")
+    else:
+        st.info("No generated content yet. Run summary/MCQ generation first.")
 with right_col:
+    # Tabs: Summary, MCQ Quiz, Q&A
+    tab1, tab2, tab3 = st.tabs(["\U0001f4d1 Summary", "\U0001f4dd MCQ Quiz", "\u2753 Q&A Dashboard"])
+    with tab1:
+        st.header("Document Summary")
+        if st.session_state.get("summary"):
+            st.markdown("<div class='qa-box handwriting'>", unsafe_allow_html=True)
+            st.markdown(st.session_state["summary"], unsafe_allow_html=True)
+            st.markdown("</div>", unsafe_allow_html=True)
+        else:
+            st.info("No summary yet. Click 'Generate Summary' in the left panel.")
+    with tab2:
+        st.header("Generated MCQs")
+        if st.session_state.get("mcq_text"):
+            # Display with formatting: question line and indented options vertically
+            st.markdown("<div class='mcq-block'>", unsafe_allow_html=True)
+            # We display as preformatted but with handwriting font and indentation
+            st.text_area("MCQs (read-only)", value=st.session_state["mcq_text"], height=420, key="mcq_display")
+            st.markdown("</div>", unsafe_allow_html=True)
+            # Also provide CSV download parsed
+            def parse_mcqs_to_df(mcq_text: str) -> pd.DataFrame:
+                lines = mcq_text.splitlines()
+                rows = []
+                q_text = None
+                opts = {"A":"","B":"","C":"","D":""}
+                answer = ""
+                for ln in lines:
+                    if not ln.strip():
+                        continue
+                    # Question detection: starts with "Question" or "Q"
+                    if ln.strip().lower().startswith("question"):
+                        if q_text:
+                            rows.append({"question": q_text.strip(), "A": opts["A"].strip(), "B": opts["B"].strip(), "C": opts["C"].strip(), "D": opts["D"].strip(), "answer": answer.strip()})
+                        # reset
+                        parts = ln.split(":",1)
+                        if len(parts) > 1:
+                            q_text = parts[1].strip()
+                        else:
+                            q_text = ln.strip()
+                        opts = {"A":"","B":"","C":"","D":""}
+                        answer = ""
+                    elif ln.strip().startswith("A.") or ln.strip().startswith("A)"):
+                        opts["A"] = ln.strip()[2:].strip()
+                    elif ln.strip().startswith("B.") or ln.strip().startswith("B)"):
+                        opts["B"] = ln.strip()[2:].strip()
+                    elif ln.strip().startswith("C.") or ln.strip().startswith("C)"):
+                        opts["C"] = ln.strip()[2:].strip()
+                    elif ln.strip().startswith("D.") or ln.strip().startswith("D)"):
+                        opts["D"] = ln.strip()[2:].strip()
+                    elif ln.strip().lower().startswith("answer"):
+                        parts = ln.split(":",1)
+                        if len(parts) > 1:
+                            answer = parts[1].strip()
+                if q_text:
+                    rows.append({"question": q_text.strip(), "A": opts["A"].strip(), "B": opts["B"].strip(), "C": opts["C"].strip(), "D": opts["D"].strip(), "answer": answer.strip()})
+                return pd.DataFrame(rows)
+            df_mcq = parse_mcqs_to_df(st.session_state["mcq_text"])
+            if not df_mcq.empty:
+                st.download_button("Download MCQs as CSV", df_mcq.to_csv(index=False), file_name=f"{uploaded_file.name}_mcqs.csv", mime="text/csv")
+        else:
+            st.info("No MCQs generated yet. Click 'Generate MCQs' in the left panel.")
+    with tab3:
+        st.header("Q&A Dashboard")
+        st.markdown("Ask questions about the PDF. Use 'Prepare Q&A' first (computes embeddings).")
+        question_input = st.text_input("Enter your question here:")
+        if st.button("Ask question"):
+            if not st.session_state.get("chunks") or not st.session_state.get("chunk_embeddings"):
+                st.warning("Please click 'Prepare Q&A (create embeddings)' in the left panel first.")
+            elif not question_input.strip():
+                st.error("Please type a question.")
+            else:
+                try:
+                    with st.spinner("Retrieving context and generating answer..."):
+                        ans = answer_question(question_input, st.session_state["chunks"], st.session_state["chunk_embeddings"], emb_model_choice, model_choice, top_k=int(retrieval_k))
+                        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
+                        st.session_state["qa_history"].append({"question": question_input, "answer": ans, "time": timestamp})
+                        st.success("Answer generated.")
+                except Exception as e:
+                    st.error(f"Q&A failed: {e}")
+        # Show history
+        if st.session_state["qa_history"]:
+            st.markdown("### Recent Q&A")
+            for qa in reversed(st.session_state["qa_history"][-8:]):
+                st.markdown(f"<div class='qa-box'><strong>Q:</strong> {qa['question']}<br/><strong>A:</strong> {qa['answer']}<div class='small-muted'>Time: {qa['time']}</div></div>", unsafe_allow_html=True)
+            # Download Q&A
+            qa_md = "\n\n".join([f"Q: {qa['question']}\nA: {qa['answer']}\nTime: {qa['time']}" for qa in st.session_state["qa_history"]])
+            st.download_button("Download Q&A history (.txt)", qa_md, file_name=f"{uploaded_file.name}_qa_history.txt", mime="text/plain")
+        else:
+            st.info("No Q&A history yet.")
 # -------------------------
 # Footer