# app.py """ AI Study Assistant - Streamlit Application Features: - Upload PDF, extract text (pdfplumber / PyPDF2 fallback) - Summarize document using OpenAI Chat API - Generate 25+ MCQs (4 options each) using OpenAI - Retrieval-based Q&A (embeddings + similarity) - Handwriting-style fonts and professional UI - Download combined output (summary, MCQs, Q&A history) as markdown (.md/.txt) - Caching and basic cost-optimizations """ import os import io import time import base64 import openai #import pypdf2 from PyPDF2 import PdfReader import pdfplumber import dotenv # Corrected from python-dotenv from typing import List, Tuple, Dict, Optional import streamlit as st import pdfplumber import pandas as pd import numpy as np from sklearn.metrics.pairwise import cosine_similarity from dotenv import load_dotenv import openai # Load .env if present (local dev) load_dotenv() # Streamlit page config st.set_page_config(page_title="AI Study Assistant", layout="wide", initial_sidebar_state="expanded") # ------------------------- # CSS / Fonts (handwriting) # ------------------------- HANDWRITING_FONTS = [ "Patrick Hand", "Caveat", "Indie Flower", "Reenie Beanie" ] google_fonts = "+".join([f"{f.replace(' ', '+')}:wght@400;700" for f in HANDWRITING_FONTS]) st.markdown( f"", unsafe_allow_html=True ) st.markdown( f""" """, unsafe_allow_html=True ) # ------------------------- # Sidebar inputs / config # ------------------------- st.sidebar.title("AI Study Assistant — Settings") # API Key input (secure) openai_key = st.sidebar.text_input("OpenAI API Key (start with sk-)", type="password", help="Your OpenAI API key. For Spaces add it to Secrets.") if openai_key: os.environ["OPENAI_API_KEY"] = openai_key elif "OPENAI_API_KEY" in os.environ: openai_key = os.environ.get("OPENAI_API_KEY") # Model selection model_choice = st.sidebar.selectbox("Generation model", options=["gpt-4", "gpt-4o", "gpt-3.5-turbo"], index=0) emb_model_choice = st.sidebar.selectbox("Embedding model", options=["text-embedding-3-small", "text-embedding-3-large"], index=0) # MCQ count (min 25) mcq_target = st.sidebar.number_input("Target number of MCQs", min_value=25, max_value=200, value=30, step=1) # Chunk/retrieval settings chunk_size = st.sidebar.number_input("Chunk size (words)", min_value=200, max_value=2000, value=700, step=50) chunk_overlap = st.sidebar.number_input("Chunk overlap (words)", min_value=50, max_value=500, value=150, step=10) retrieval_k = st.sidebar.number_input("Retrieval top-k", min_value=1, max_value=8, value=4, step=1) st.sidebar.markdown("---") st.sidebar.markdown("**Tips:** Use PDFs with selectable text for best results. Scanned PDFs may require OCR.") # ------------------------- # OpenAI initialization # ------------------------- def ensure_openai_key(): key = os.environ.get("OPENAI_API_KEY", None) if not key: raise RuntimeError("OpenAI API key not found. Set it in the sidebar or add OPENAI_API_KEY to environment.") openai.api_key = key # ------------------------- # PDF extraction utilities # ------------------------- @st.cache_data(show_spinner=False) def extract_text_pdfplumber(file_bytes: bytes) -> str: """Extract text using pdfplumber (best for most PDFs). Cached to avoid repeated work.""" text_pages = [] try: with pdfplumber.open(io.BytesIO(file_bytes)) as pdf: for p in pdf.pages: txt = p.extract_text() if txt: text_pages.append(txt) except Exception as e: # Let caller fallback to PyPDF2 raise e return "\n\n".join(text_pages).strip() @st.cache_data(show_spinner=False) def extract_text_pypdf2(file_bytes: bytes) -> str: """Fallback extraction using PyPDF2.""" text_pages = [] try: reader = PyPDF2.PdfReader(io.BytesIO(file_bytes)) for page in reader.pages: try: txt = page.extract_text() except Exception: txt = None if txt: text_pages.append(txt) except Exception as e: raise e return "\n\n".join(text_pages).strip() def extract_text(file_bytes: bytes) -> str: """Robust extraction: try pdfplumber first, fallback to PyPDF2.""" text = "" try: text = extract_text_pdfplumber(file_bytes) if not text: raise ValueError("pdfplumber returned empty text.") except Exception: text = extract_text_pypdf2(file_bytes) return text # ------------------------- # Chunking / embeddings / retrieval # ------------------------- @st.cache_data(show_spinner=False) def chunk_text(text: str, words_per_chunk: int = 700, overlap: int = 150) -> List[str]: words = text.split() chunks = [] start = 0 L = len(words) while start < L: end = min(start + words_per_chunk, L) chunk = " ".join(words[start:end]) chunks.append(chunk) start = end - overlap if start < 0: start = 0 return chunks @st.cache_data(show_spinner=False) def get_embeddings(texts: List[str], model: str) -> List[List[float]]: ensure_openai_key() # Batch call to embeddings API resp = openai.Embedding.create(model=model, input=texts) embeddings = [row["embedding"] for row in resp["data"]] return embeddings def top_k_chunks(question: str, chunks: List[str], chunk_embs: List[List[float]], k: int = 4, emb_model: str = "text-embedding-3-small"): ensure_openai_key() # compute question embedding q_emb = get_embeddings([question], model=emb_model)[0] sims = cosine_similarity([q_emb], chunk_embs)[0] idx = np.argsort(sims)[-k:][::-1] selected = [chunks[i] for i in idx] return selected, idx # ------------------------- # OpenAI Chat wrappers # ------------------------- def call_chat_completion(messages: List[Dict], model: str = "gpt-3.5-turbo", max_tokens: int = 700, temperature: float = 0.2): ensure_openai_key() try: resp = openai.ChatCompletion.create( model=model, messages=messages, max_tokens=max_tokens, temperature=temperature ) return resp["choices"][0]["message"]["content"].strip() except openai.error.OpenAIError as e: raise RuntimeError(f"OpenAI API error: {e}") # ------------------------- # Prompt engineering functions # ------------------------- def generate_summary(full_text: str, model: str = "gpt-4") -> str: """ Create a concise but comprehensive summary with headings and key bullets. To reduce tokens we can ask the model to summarize sections first (but here we send full text). """ prompt = [ { "role": "system", "content": "You are an assistant that summarizes documents for study and revision." }, { "role": "user", "content": ( "Summarize the following document for exam revision. " "Provide a concise executive summary (3-6 sentences), then key takeaways as bullet points, and a short list of important terms and definitions. " "Use clear headings. Keep the style formal and compact.\n\n" f"Document:\n\n{full_text}" ) } ] # Limit tokens to protect cost; large docs may need chunked summarization — user can call again if needed return call_chat_completion(prompt, model=model, max_tokens=900, temperature=0.2) def generate_mcqs(full_text: str, model: str = "gpt-4", count: int = 30) -> str: """ Generate MCQs formatted consistently. We ask the model to return plaintext in a structured format. """ instruction = ( f"Create {count} multiple-choice questions (MCQs) based on the document below. " "Each question must have 4 options labeled A, B, C, D and one correct answer. " "Make questions diverse (recall, concept, application). Mark the correct answer on a separate 'Answer:' line. " "Format EXACTLY like this for each question:\n\n" "Question : \n\n" " A.