"""BookScribe AI: a Streamlit app that indexes an uploaded PDF and answers questions.

Flow: upload a PDF -> split it into chunks -> embed the chunks into a FAISS
vector store -> ask Groq for per-"chapter" summaries -> answer free-form
questions with retrieval-augmented prompts.
"""
import os
import pickle
import tempfile
import uuid

import pandas as pd  # noqa: F401  (kept: other parts of the project may rely on it)
import streamlit as st
from dotenv import load_dotenv
from groq import Groq
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()

# App title and description
st.set_page_config(page_title="BookScribe AI", layout="wide")
st.title("📚 BookScribe AI")

GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Build the client once, and only when a key is configured: constructing
# Groq() without a key raises, which would crash the app before any of the
# "is the key present?" guards below get a chance to run.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
if groq_client is None:
    st.warning(
        "GROQ_API_KEY is not set. Add it to your environment or .env file "
        "to enable PDF processing and Q&A."
    )

st.markdown("""
Transform your PDFs into interactive knowledge bases with personalized summaries.
Upload a document, choose your learning style, and start exploring!
""")

# Initialize session state variables if they don't exist
if 'user_id' not in st.session_state:
    st.session_state.user_id = str(uuid.uuid4())
if 'processed_files' not in st.session_state:
    st.session_state.processed_files = []
if 'current_vector_store' not in st.session_state:
    st.session_state.current_vector_store = None
if 'current_file_name' not in st.session_state:
    st.session_state.current_file_name = None
if 'chapter_summaries' not in st.session_state:
    # Maps file name -> {"Chapter N": summary} so every document in the
    # library keeps its own summaries.
    st.session_state.chapter_summaries = {}

# Sidebar for configuration
with st.sidebar:
    st.header("Configuration")

    # Learning style selector
    learning_style = st.selectbox(
        "Select your learning style:",
        ["Visual learner", "Auditory learner", "Reading/writing learner", "Kinesthetic learner"]
    )

    # Choose LLM model
    llm_model = st.selectbox(
        "Select Groq LLM Model:",
        ["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768"]
    )

    # Language model parameters
    temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.3, step=0.1)
    max_tokens = st.slider("Max Tokens", min_value=100, max_value=4000, value=1000, step=100)


# Initialize the embedding model (Hugging Face)
@st.cache_resource
def get_embedding_model():
    """Return the sentence-transformers embedding model, cached across reruns."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )


def _vector_store_path(file_name):
    """Return the pickle path used to persist *file_name*'s vector store for this session."""
    storage_path = f"temp_storage/{st.session_state.user_id}"
    return f"{storage_path}/{file_name.replace(' ', '_')}.pkl"


def _group_chunks_by_page(chunks):
    """Group consecutive chunks that share a page number into "chapters".

    Simplified heuristic: every page change starts a new chapter.
    """
    chapters = []
    current_chapter = []
    current_page = None
    for chunk in chunks:
        page = chunk.metadata.get('page', 0)
        if current_chapter and page != current_page:
            chapters.append(current_chapter)
            current_chapter = []
        current_chapter.append(chunk)
        current_page = page
    if current_chapter:
        chapters.append(current_chapter)
    return chapters


# Process the uploaded PDF
def process_pdf(pdf_file, file_name):
    """Split an uploaded PDF into chunks, embed them, and persist the vector store.

    Args:
        pdf_file: The uploaded file object; its bytes are read via ``getvalue()``.
        file_name: Base name used for the persisted vector store file.

    Returns:
        A ``(vector_store, chapters)`` tuple, where ``chapters`` is a list of
        lists of chunks grouped by page.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    # PyPDFLoader needs a real path, so spill the upload to a temporary file.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
        tmp_file.write(pdf_file.getvalue())
        pdf_path = tmp_file.name

    try:
        documents = PyPDFLoader(pdf_path).load()
    finally:
        # Always remove the temp file, even when loading fails.
        os.unlink(pdf_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # Building a FAISS index from zero documents fails with an opaque error.
        raise ValueError("No extractable text was found in this PDF.")

    chapters = _group_chunks_by_page(chunks)

    # Create vector store with embeddings
    vector_store = FAISS.from_documents(chunks, get_embedding_model())

    # Save vector store temporarily
    store_path = _vector_store_path(file_name)
    os.makedirs(os.path.dirname(store_path), exist_ok=True)
    with open(store_path, "wb") as f:
        pickle.dump(vector_store, f)

    return vector_store, chapters


# Generate chapter summaries with Groq
def generate_summaries(chapters, learning_style, groq_client, model,
                       temperature=0.3, max_tokens=1000):
    """Ask Groq for one summary per chapter, tailored to a learning style.

    Args:
        chapters: List of chapters, each a list of chunks with ``page_content``.
        learning_style: Learning style label inserted into the prompt.
        groq_client: Groq client used for the chat completion calls.
        model: Groq model identifier.
        temperature: Sampling temperature passed to the API.
        max_tokens: Maximum number of tokens to generate per summary.

    Returns:
        Dict mapping ``"Chapter N"`` to the summary text. A chapter whose API
        call fails maps to ``"Error generating summary."`` and the error is
        shown in the UI.
    """
    summaries = {}
    for i, chapter in enumerate(chapters):
        # Combine all text in the chapter
        chapter_text = " ".join(doc.page_content for doc in chapter)

        # Only the first 3000 characters are sent, for API efficiency. This
        # note is deliberately kept OUT of the prompt string so it is not
        # sent to the model as part of the text to summarize.
        excerpt = chapter_text[:3000]
        prompt = (
            f"\nSummarize the following text for a {learning_style}:\n\n"
            f"{excerpt}\n\n"
            "Give a summary that includes:\n"
            "1. Main concepts in bullet points\n"
            "2. A visual metaphor or analogy\n"
            "3. Key takeaways\n"
        )

        # Call Groq API
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            summaries[f"Chapter {i+1}"] = chat_completion.choices[0].message.content
        except Exception as e:  # UI boundary: report and continue with the next chapter
            st.error(f"Error generating summary: {str(e)}")
            summaries[f"Chapter {i+1}"] = "Error generating summary."

    return summaries


# File upload section
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

# Process the uploaded file
if uploaded_file and groq_client is not None:
    # Strip directories and only the final extension, so "my.book.v2.pdf"
    # becomes "my.book.v2" rather than "my".
    file_name = os.path.splitext(os.path.basename(uploaded_file.name))[0]

    if st.button("Process PDF"):
        try:
            with st.spinner("Processing PDF..."):
                vector_store, chapters = process_pdf(uploaded_file, file_name)
        except ValueError as e:
            st.error(str(e))
        else:
            # Save to session state
            st.session_state.current_vector_store = vector_store
            st.session_state.current_file_name = file_name

            # Generate summaries
            with st.spinner("Generating chapter summaries..."):
                summaries = generate_summaries(
                    chapters,
                    learning_style,
                    groq_client,
                    llm_model,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )
            st.session_state.chapter_summaries[file_name] = summaries

            # Add to processed files if not already there
            if file_name not in st.session_state.processed_files:
                st.session_state.processed_files.append(file_name)

            st.success(f"Successfully processed {file_name}!")

# Display processed files
if st.session_state.processed_files:
    st.header("Your Library")
    library = st.session_state.processed_files
    active_file = st.session_state.current_file_name
    # Default to the document currently loaded, so processing a new file does
    # not silently switch back to the first library entry.
    default_index = library.index(active_file) if active_file in library else 0
    selected_file = st.selectbox(
        "Select a document to explore:",
        library,
        index=default_index,
    )

    # Load vector store if needed
    if selected_file != st.session_state.current_file_name:
        vector_store_path = _vector_store_path(selected_file)
        if os.path.exists(vector_store_path):
            # NOTE(security): pickle.load executes arbitrary code from the
            # file. This is only acceptable because the file is written by
            # process_pdf in this same app; never point it at untrusted data.
            with open(vector_store_path, "rb") as f:
                st.session_state.current_vector_store = pickle.load(f)
            st.session_state.current_file_name = selected_file
        else:
            st.error("Vector store not found. Please reprocess the document.")

# Display chapter summaries for the document that is currently loaded
current_summaries = st.session_state.chapter_summaries.get(
    st.session_state.current_file_name, {}
)
if current_summaries:
    st.header("Chapter Summaries")
    for chapter, summary in current_summaries.items():
        with st.expander(chapter):
            st.markdown(summary)

# Q&A section
if st.session_state.current_vector_store is not None and groq_client is not None:
    st.header("Ask Questions About Your Document")
    question = st.text_input("Ask a question about the content:")

    if question and st.button("Get Answer"):
        with st.spinner("Generating answer..."):
            # Search for relevant documents
            docs = st.session_state.current_vector_store.similarity_search(question, k=3)
            context = "\n\n".join(doc.page_content for doc in docs)

            # Generate prompt
            prompt = (
                "\nAnswer the following question based on the provided context.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {question}\n\n"
                f"For a {learning_style}, provide:\n"
                "1. A clear, concise answer\n"
                "2. An example or illustration if applicable\n"
                "3. A connection to any main concepts from the document\n"
            )

            # Call Groq API
            try:
                chat_completion = groq_client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model=llm_model,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )
                answer = chat_completion.choices[0].message.content

                st.markdown("### Answer")
                st.markdown(answer)

                # Show sources
                with st.expander("Sources"):
                    for i, doc in enumerate(docs):
                        st.markdown(f"**Source {i+1}**")
                        st.markdown(doc.page_content)
                        st.markdown(f"*Page: {doc.metadata.get('page', 'Unknown')}*")
                        st.divider()
            except Exception as e:  # UI boundary: surface the failure to the user
                st.error(f"Error generating answer: {str(e)}")

# App footer
st.markdown("---")
st.markdown("BookScribe AI - Powered by Groq and Hugging Face")