|
|
import streamlit as st |
|
|
import os |
|
|
import tempfile |
|
|
import pandas as pd |
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
from langchain_community.vectorstores import FAISS |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from dotenv import load_dotenv |
|
|
from groq import Groq |
|
|
import pickle |
|
|
import uuid |
|
|
# Load environment variables (expects GROQ_API_KEY in a .env file or the environment).
load_dotenv()

st.set_page_config(page_title="BookScribe AI", layout="wide")

st.title("📚 BookScribe AI")

# May be None when the key is not configured; downstream sections guard on it.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fix: only instantiate the client when a key is actually present.  The
# previous unconditional Groq(api_key=None) construction could raise at
# script start, before the UI had a chance to tell the user what was wrong.
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
if not GROQ_API_KEY:
    st.warning("GROQ_API_KEY is not set. Add it to your environment or .env file to enable AI features.")

st.markdown("""
Transform your PDFs into interactive knowledge bases with personalized summaries.
Upload a document, choose your learning style, and start exploring!
""")
|
|
|
|
|
|
|
|
# Seed st.session_state with every key this app relies on.  Each entry is a
# zero-argument factory so expensive/side-effecting defaults (the uuid) are
# only produced on the first run of a given browser session.
_session_defaults = {
    'user_id': lambda: str(uuid.uuid4()),
    'processed_files': lambda: [],
    'current_vector_store': lambda: None,
    'current_file_name': lambda: None,
    'chapter_summaries': lambda: {},
}
for _key, _make_default in _session_defaults.items():
    if _key not in st.session_state:
        st.session_state[_key] = _make_default()
|
|
|
|
|
|
|
|
# Sidebar: per-session configuration read by the summary and Q&A sections below.
with st.sidebar:
    st.header("Configuration")

    # Interpolated into the LLM prompts to tailor explanations to the user.
    learning_style = st.selectbox(
        "Select your learning style:",
        ["Visual learner", "Auditory learner", "Reading/writing learner", "Kinesthetic learner"]
    )

    # Groq model identifier passed straight to chat.completions.create.
    llm_model = st.selectbox(
        "Select Groq LLM Model:",
        ["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768"]
    )

    # Sampling settings; referenced as module globals inside generate_summaries
    # and the Q&A section.
    temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.3, step=0.1)
    max_tokens = st.slider("Max Tokens", min_value=100, max_value=4000, value=1000, step=100)
|
|
|
|
|
|
|
|
@st.cache_resource
def get_embedding_model():
    """Return the shared sentence-transformer embedding model.

    ``st.cache_resource`` ensures the model is loaded once per server
    process and reused across reruns and sessions.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
|
|
|
|
|
|
|
|
def process_pdf(pdf_file, file_name):
    """Load an uploaded PDF, chunk it, group chunks into pseudo-chapters, and
    build + persist a FAISS vector store for later retrieval.

    Parameters
    ----------
    pdf_file :
        Streamlit UploadedFile; ``.getvalue()`` yields the raw PDF bytes.
    file_name : str
        Display name; also used (spaces -> underscores) for the pickle filename.

    Returns
    -------
    tuple
        ``(vector_store, chapters)`` where ``chapters`` is a list of lists of
        chunk documents.
    """
    # PyPDFLoader needs a real path, so spill the upload to a temporary file.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
        tmp_file.write(pdf_file.getvalue())
        pdf_path = tmp_file.name

    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = text_splitter.split_documents(documents)

        # Group consecutive chunks by source page: each page boundary starts a
        # new "chapter".  Crude, but PDFs rarely expose real chapter metadata.
        chapters = []
        current_chapter = []
        current_page = None

        for chunk in chunks:
            page = chunk.metadata.get('page', 0)
            if current_page is None:
                current_page = page

            if page != current_page and current_chapter:
                chapters.append(current_chapter)
                current_chapter = []

            current_chapter.append(chunk)
            current_page = page

        # Flush the trailing chapter (handles the empty-document case too).
        if current_chapter:
            chapters.append(current_chapter)

        embeddings = get_embedding_model()
        vector_store = FAISS.from_documents(chunks, embeddings)

        # Persist per-user so the document can be reopened from the library later.
        storage_path = f"temp_storage/{st.session_state.user_id}"
        os.makedirs(storage_path, exist_ok=True)

        # NOTE(review): pickling a FAISS store is fragile across library
        # versions; FAISS.save_local()/load_local() is the supported
        # persistence API and would be a safer替代 — confirm before switching.
        with open(f"{storage_path}/{file_name.replace(' ', '_')}.pkl", "wb") as f:
            pickle.dump(vector_store, f)
    finally:
        # Fix: previously the temp file leaked when loading/splitting raised;
        # cleanup now runs on every exit path.
        os.unlink(pdf_path)

    return vector_store, chapters
|
|
|
|
|
|
|
|
def generate_summaries(chapters, learning_style, groq_client, model,
                       temperature=None, max_tokens=None):
    """Generate one summary per chapter, tailored to the user's learning style.

    Parameters
    ----------
    chapters : list[list]
        Chapters as produced by ``process_pdf`` (lists of chunk documents,
        each with a ``page_content`` attribute).
    learning_style : str
        e.g. "Visual learner"; interpolated into the prompt.
    groq_client :
        Initialized Groq client (anything exposing ``chat.completions.create``).
    model : str
        Groq model identifier.
    temperature, max_tokens : optional
        Sampling settings.  When omitted, fall back to the module-level values
        set by the sidebar sliders, preserving the original 4-argument calls.

    Returns
    -------
    dict[str, str]
        Maps "Chapter N" to its summary, or an error placeholder on failure.
    """
    # Backward-compatible fallback to the sidebar-controlled module globals.
    if temperature is None:
        temperature = globals().get("temperature", 0.3)
    if max_tokens is None:
        max_tokens = globals().get("max_tokens", 1000)

    summaries = {}

    for i, chapter in enumerate(chapters):
        chapter_text = " ".join([doc.page_content for doc in chapter])

        # Limit to the first 3000 chars for API efficiency.  (Fix: this note
        # previously lived *inside* the f-string and was sent to the model
        # verbatim as part of the prompt.)
        prompt = f"""
Summarize the following text for a {learning_style}:

{chapter_text[:3000]}

Give a summary that includes:
1. Main concepts in bullet points
2. A visual metaphor or analogy
3. Key takeaways
"""

        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            summaries[f"Chapter {i+1}"] = chat_completion.choices[0].message.content
        except Exception as e:
            # Surface the failure in the UI but keep going on later chapters.
            st.error(f"Error generating summary: {str(e)}")
            summaries[f"Chapter {i+1}"] = "Error generating summary."

    return summaries
|
|
|
|
|
|
|
|
# Document ingestion: upload a PDF, build its vector store, and pre-generate
# chapter summaries.  Requires a configured Groq API key.
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file and GROQ_API_KEY:
    groq_client = Groq(api_key=GROQ_API_KEY)

    # Fix: the previous `name.split('.')[0]` truncated filenames containing
    # dots (e.g. "report v1.2.pdf" -> "report v1"); splitext strips only the
    # final extension.
    file_name = os.path.splitext(uploaded_file.name)[0]

    if st.button("Process PDF"):
        with st.spinner("Processing PDF..."):
            vector_store, chapters = process_pdf(uploaded_file, file_name)

            # Make the freshly built store the active document.
            st.session_state.current_vector_store = vector_store
            st.session_state.current_file_name = file_name

        with st.spinner("Generating chapter summaries..."):
            summaries = generate_summaries(
                chapters,
                learning_style,
                groq_client,
                llm_model
            )
            st.session_state.chapter_summaries = summaries

        # Record the document in the session library exactly once.
        if file_name not in st.session_state.processed_files:
            st.session_state.processed_files.append(file_name)

        st.success(f"Successfully processed {file_name}!")
|
|
|
|
|
|
|
|
# Library: switch between previously processed documents by reloading their
# persisted vector stores from this session's temp_storage directory.
if st.session_state.processed_files:
    st.header("Your Library")

    selected_file = st.selectbox(
        "Select a document to explore:",
        st.session_state.processed_files
    )

    # Only reload when the selection actually changed.
    if selected_file != st.session_state.current_file_name:
        storage_path = f"temp_storage/{st.session_state.user_id}"
        vector_store_path = f"{storage_path}/{selected_file.replace(' ', '_')}.pkl"

        if os.path.exists(vector_store_path):
            # NOTE(review): pickle.load on these files is only safe because the
            # app itself wrote them; never point this at untrusted input.
            with open(vector_store_path, "rb") as f:
                st.session_state.current_vector_store = pickle.load(f)
            st.session_state.current_file_name = selected_file
        else:
            st.error("Vector store not found. Please reprocess the document.")
|
|
|
|
|
|
|
|
# Render the cached chapter summaries, one collapsible expander per chapter.
# Note: these belong to the most recently *processed* document, not necessarily
# the one currently selected in the library.
if st.session_state.chapter_summaries:
    st.header("Chapter Summaries")

    for chapter, summary in st.session_state.chapter_summaries.items():
        with st.expander(chapter):
            st.markdown(summary)
|
|
|
|
|
|
|
|
# Q&A: retrieval-augmented answering over the currently active vector store.
if st.session_state.current_vector_store and GROQ_API_KEY:
    st.header("Ask Questions About Your Document")

    question = st.text_input("Ask a question about the content:")

    if question and st.button("Get Answer"):
        with st.spinner("Generating answer..."):
            groq_client = Groq(api_key=GROQ_API_KEY)

            # Retrieve the 3 chunks most similar to the question and stitch
            # them into the prompt context.
            docs = st.session_state.current_vector_store.similarity_search(question, k=3)
            context = "\n\n".join([doc.page_content for doc in docs])

            prompt = f"""
Answer the following question based on the provided context.

Context:
{context}

Question: {question}

For a {learning_style}, provide:
1. A clear, concise answer
2. An example or illustration if applicable
3. A connection to any main concepts from the document
"""

            try:
                # temperature / max_tokens come from the sidebar sliders.
                chat_completion = groq_client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        }
                    ],
                    model=llm_model,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )
                answer = chat_completion.choices[0].message.content

                st.markdown("### Answer")
                st.markdown(answer)

                # Show the retrieved chunks so users can verify the answer.
                with st.expander("Sources"):
                    for i, doc in enumerate(docs):
                        st.markdown(f"**Source {i+1}**")
                        st.markdown(doc.page_content)
                        st.markdown(f"*Page: {doc.metadata.get('page', 'Unknown')}*")
                        st.divider()

            except Exception as e:
                st.error(f"Error generating answer: {str(e)}")
|
|
|
|
|
|
|
|
# Page footer.
st.markdown("---")

st.markdown("BookScribe AI - Powered by Groq and Hugging Face")
|
|
|