import streamlit as st
import os
import tempfile
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from groq import Groq
import pickle
import uuid
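# Required packages (assumed from the imports above): streamlit, pandas, langchain-text-splitters,
# langchain-community, faiss-cpu, sentence-transformers, pypdf, python-dotenv, groq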
load_dotenv()
# App title and description
st.set_page_config(page_title="BookScribe AI", layout="wide")
st.title("📚 BookScribe AI")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.warning("GROQ_API_KEY is not set. Add it to your environment or .env file to enable summaries and Q&A.")
# Create the Groq client only when an API key is available (Groq() raises without one)
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
st.markdown("""
Transform your PDFs into interactive knowledge bases with personalized summaries.
Upload a document, choose your learning style, and start exploring!
""")
# Initialize session state variables if they don't exist
if 'user_id' not in st.session_state:
    st.session_state.user_id = str(uuid.uuid4())
if 'processed_files' not in st.session_state:
    st.session_state.processed_files = []
if 'current_vector_store' not in st.session_state:
    st.session_state.current_vector_store = None
if 'current_file_name' not in st.session_state:
    st.session_state.current_file_name = None
if 'chapter_summaries' not in st.session_state:
    st.session_state.chapter_summaries = {}
# Sidebar for configuration
with st.sidebar:
    st.header("Configuration")
    # Learning style selector
    learning_style = st.selectbox(
        "Select your learning style:",
        ["Visual learner", "Auditory learner", "Reading/writing learner", "Kinesthetic learner"]
    )
    # Choose LLM model
    llm_model = st.selectbox(
        "Select Groq LLM Model:",
        ["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768"]
    )
    # Language model parameters
    temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.3, step=0.1)
    max_tokens = st.slider("Max Tokens", min_value=100, max_value=4000, value=1000, step=100)
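# Note: generate_summaries() and the Q&A handler below read temperature and
# max_tokens directly as module-level names set by these sidebar sliders.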
# Initialize the embedding model (Hugging Face)
@st.cache_resource
def get_embedding_model():
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
# Process the uploaded PDF
def process_pdf(pdf_file, file_name):
    # Create a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
        tmp_file.write(pdf_file.getvalue())
        pdf_path = tmp_file.name
    # Load PDF
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # Split the extracted text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)
    # Group chunks into logical "chapters" - simplified approach
    chapters = []
    current_chapter = []
    current_page = None
    for chunk in chunks:
        page = chunk.metadata.get('page', 0)
        if current_page is None:
            current_page = page
        # Simple heuristic: a new page could be a new chapter
        if page != current_page and current_chapter:
            chapters.append(current_chapter)
            current_chapter = []
        current_chapter.append(chunk)
        current_page = page
    # Add the last chapter
    if current_chapter:
        chapters.append(current_chapter)
    # Create vector store with embeddings
    embeddings = get_embedding_model()
    vector_store = FAISS.from_documents(chunks, embeddings)
    # Save vector store temporarily
    storage_path = f"temp_storage/{st.session_state.user_id}"
    os.makedirs(storage_path, exist_ok=True)
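    # Note: pickling the store assumes the installed faiss build supports pickling
    # its index objects; FAISS.save_local()/load_local() is an alternative persistence route.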
    with open(f"{storage_path}/{file_name.replace(' ', '_')}.pkl", "wb") as f:
        pickle.dump(vector_store, f)
    # Clean up temp file
    os.unlink(pdf_path)
    return vector_store, chapters
# Generate chapter summaries with Groq
def generate_summaries(chapters, learning_style, groq_client, model):
    summaries = {}
    for i, chapter in enumerate(chapters):
        # Combine all text in the chapter
        chapter_text = " ".join([doc.page_content for doc in chapter])
        # Generate prompt based on learning style
        # (only the first 3000 characters are sent, to keep the request small)
        prompt = f"""
Summarize the following text for a {learning_style}:

{chapter_text[:3000]}

Give a summary that includes:
1. Main concepts in bullet points
2. A visual metaphor or analogy
3. Key takeaways
"""
        # Call Groq API
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            summary = chat_completion.choices[0].message.content
            summaries[f"Chapter {i+1}"] = summary
        except Exception as e:
            st.error(f"Error generating summary: {str(e)}")
            summaries[f"Chapter {i+1}"] = "Error generating summary."
    return summaries
# File upload section
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
# Process the uploaded file
if uploaded_file and GROQ_API_KEY:
    # Initialize Groq client
    groq_client = Groq(api_key=GROQ_API_KEY)
    file_name = os.path.splitext(uploaded_file.name)[0]
    if st.button("Process PDF"):
        with st.spinner("Processing PDF..."):
            # Process the PDF and get vector store and chapters
            vector_store, chapters = process_pdf(uploaded_file, file_name)
            # Save to session state
            st.session_state.current_vector_store = vector_store
            st.session_state.current_file_name = file_name
            # Generate summaries
            with st.spinner("Generating chapter summaries..."):
                summaries = generate_summaries(
                    chapters,
                    learning_style,
                    groq_client,
                    llm_model
                )
                st.session_state.chapter_summaries = summaries
            # Add to processed files if not already there
            if file_name not in st.session_state.processed_files:
                st.session_state.processed_files.append(file_name)
            st.success(f"Successfully processed {file_name}!")
# Display processed files
if st.session_state.processed_files:
    st.header("Your Library")
    selected_file = st.selectbox(
        "Select a document to explore:",
        st.session_state.processed_files
    )
    # Load vector store if needed
    if selected_file != st.session_state.current_file_name:
        storage_path = f"temp_storage/{st.session_state.user_id}"
        vector_store_path = f"{storage_path}/{selected_file.replace(' ', '_')}.pkl"
        if os.path.exists(vector_store_path):
            with open(vector_store_path, "rb") as f:
                st.session_state.current_vector_store = pickle.load(f)
            st.session_state.current_file_name = selected_file
        else:
            st.error("Vector store not found. Please reprocess the document.")
# Display chapter summaries
if st.session_state.chapter_summaries:
    st.header("Chapter Summaries")
    for chapter, summary in st.session_state.chapter_summaries.items():
        with st.expander(chapter):
            st.markdown(summary)
# Q&A section
if st.session_state.current_vector_store and GROQ_API_KEY:
    st.header("Ask Questions About Your Document")
    question = st.text_input("Ask a question about the content:")
    if question and st.button("Get Answer"):
        with st.spinner("Generating answer..."):
            # Initialize Groq client
            groq_client = Groq(api_key=GROQ_API_KEY)
            # Search for relevant documents
            docs = st.session_state.current_vector_store.similarity_search(question, k=3)
            context = "\n\n".join([doc.page_content for doc in docs])
            # Generate prompt
            prompt = f"""
Answer the following question based on the provided context.

Context:
{context}

Question: {question}

For a {learning_style}, provide:
1. A clear, concise answer
2. An example or illustration if applicable
3. A connection to any main concepts from the document
"""
            # Call Groq API
            try:
                chat_completion = groq_client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        }
                    ],
                    model=llm_model,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )
                answer = chat_completion.choices[0].message.content
                st.markdown("### Answer")
                st.markdown(answer)
                # Show sources
                with st.expander("Sources"):
                    for i, doc in enumerate(docs):
                        st.markdown(f"**Source {i+1}**")
                        st.markdown(doc.page_content)
                        st.markdown(f"*Page: {doc.metadata.get('page', 'Unknown')}*")
                        st.divider()
            except Exception as e:
                st.error(f"Error generating answer: {str(e)}")
# App footer
st.markdown("---")
st.markdown("BookScribe AI - Powered by Groq and Hugging Face")