import streamlit as st
import os
import tempfile
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from groq import Groq
import pickle
import uuid
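# Required packages (assumed from the imports above): streamlit, pandas, langchain-text-splitters,
# langchain-community, faiss-cpu, sentence-transformers, pypdf, python-dotenv, groq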
load_dotenv()
# App title and description
st.set_page_config(page_title="BookScribe AI", layout="wide")
st.title("📚 BookScribe AI")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.warning("GROQ_API_KEY is not set. Add it to your environment or .env file to enable summaries and Q&A.")
# Create the Groq client only when an API key is available (Groq() raises without one)
groq_client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
st.markdown("""
Transform your PDFs into interactive knowledge bases with personalized summaries.
Upload a document, choose your learning style, and start exploring!
""")
# Initialize session state variables if they don't exist
if 'user_id' not in st.session_state:
    st.session_state.user_id = str(uuid.uuid4())
if 'processed_files' not in st.session_state:
    st.session_state.processed_files = []
if 'current_vector_store' not in st.session_state:
    st.session_state.current_vector_store = None
if 'current_file_name' not in st.session_state:
    st.session_state.current_file_name = None
if 'chapter_summaries' not in st.session_state:
    st.session_state.chapter_summaries = {}
# Sidebar for configuration
with st.sidebar:
    st.header("Configuration")
    # Learning style selector
    learning_style = st.selectbox(
        "Select your learning style:",
        ["Visual learner", "Auditory learner", "Reading/writing learner", "Kinesthetic learner"]
    )
    # Choose LLM model
    llm_model = st.selectbox(
        "Select Groq LLM Model:",
        ["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768"]
    )
    # Language model parameters
    temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.3, step=0.1)
    max_tokens = st.slider("Max Tokens", min_value=100, max_value=4000, value=1000, step=100)
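# Note: generate_summaries() and the Q&A handler below read temperature and
# max_tokens directly as module-level names set by these sidebar sliders.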
# Initialize the embedding model (Hugging Face)
@st.cache_resource
def get_embedding_model():
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
# Process the uploaded PDF
def process_pdf(pdf_file, file_name):
    # Create a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
        tmp_file.write(pdf_file.getvalue())
        pdf_path = tmp_file.name
    # Load PDF
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # Split the extracted text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)
    # Group chunks into logical "chapters" - simplified approach
    chapters = []
    current_chapter = []
    current_page = None
    for chunk in chunks:
        page = chunk.metadata.get('page', 0)
        if current_page is None:
            current_page = page
        # Simple heuristic: a new page could be a new chapter
        if page != current_page and current_chapter:
            chapters.append(current_chapter)
            current_chapter = []
        current_chapter.append(chunk)
        current_page = page
    # Add the last chapter
    if current_chapter:
        chapters.append(current_chapter)
    # Create vector store with embeddings
    embeddings = get_embedding_model()
    vector_store = FAISS.from_documents(chunks, embeddings)
    # Save vector store temporarily
    storage_path = f"temp_storage/{st.session_state.user_id}"
    os.makedirs(storage_path, exist_ok=True)
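    # Note: pickling the store assumes the installed faiss build supports pickling
    # its index objects; FAISS.save_local()/load_local() is an alternative persistence route.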
    with open(f"{storage_path}/{file_name.replace(' ', '_')}.pkl", "wb") as f:
        pickle.dump(vector_store, f)
    # Clean up temp file
    os.unlink(pdf_path)
    return vector_store, chapters
# Generate chapter summaries with Groq
def generate_summaries(chapters, learning_style, groq_client, model):
    summaries = {}
    for i, chapter in enumerate(chapters):
        # Combine all text in the chapter
        chapter_text = " ".join([doc.page_content for doc in chapter])
        # Generate prompt based on learning style
        # (only the first 3000 characters are sent, to keep the request small)
        prompt = f"""
Summarize the following text for a {learning_style}:

{chapter_text[:3000]}

Give a summary that includes:
1. Main concepts in bullet points
2. A visual metaphor or analogy
3. Key takeaways
"""
        # Call Groq API
        try:
            chat_completion = groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            summary = chat_completion.choices[0].message.content
            summaries[f"Chapter {i+1}"] = summary
        except Exception as e:
            st.error(f"Error generating summary: {str(e)}")
            summaries[f"Chapter {i+1}"] = "Error generating summary."
    return summaries
# File upload section
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
# Process the uploaded file
if uploaded_file and GROQ_API_KEY:
    # Initialize Groq client
    groq_client = Groq(api_key=GROQ_API_KEY)
    file_name = os.path.splitext(uploaded_file.name)[0]
    if st.button("Process PDF"):
        with st.spinner("Processing PDF..."):
            # Process the PDF and get vector store and chapters
            vector_store, chapters = process_pdf(uploaded_file, file_name)
            # Save to session state
            st.session_state.current_vector_store = vector_store
            st.session_state.current_file_name = file_name
            # Generate summaries
            with st.spinner("Generating chapter summaries..."):
                summaries = generate_summaries(
                    chapters,
                    learning_style,
                    groq_client,
                    llm_model
                )
                st.session_state.chapter_summaries = summaries
            # Add to processed files if not already there
            if file_name not in st.session_state.processed_files:
                st.session_state.processed_files.append(file_name)
            st.success(f"Successfully processed {file_name}!")
# Display processed files
if st.session_state.processed_files:
    st.header("Your Library")
    selected_file = st.selectbox(
        "Select a document to explore:",
        st.session_state.processed_files
    )
    # Load vector store if needed
    if selected_file != st.session_state.current_file_name:
        storage_path = f"temp_storage/{st.session_state.user_id}"
        vector_store_path = f"{storage_path}/{selected_file.replace(' ', '_')}.pkl"
        if os.path.exists(vector_store_path):
            with open(vector_store_path, "rb") as f:
                st.session_state.current_vector_store = pickle.load(f)
            st.session_state.current_file_name = selected_file
        else:
            st.error("Vector store not found. Please reprocess the document.")
# Display chapter summaries
if st.session_state.chapter_summaries:
    st.header("Chapter Summaries")
    for chapter, summary in st.session_state.chapter_summaries.items():
        with st.expander(chapter):
            st.markdown(summary)
# Q&A section
if st.session_state.current_vector_store and GROQ_API_KEY:
    st.header("Ask Questions About Your Document")
    question = st.text_input("Ask a question about the content:")
    if question and st.button("Get Answer"):
        with st.spinner("Generating answer..."):
            # Initialize Groq client
            groq_client = Groq(api_key=GROQ_API_KEY)
            # Search for relevant documents
            docs = st.session_state.current_vector_store.similarity_search(question, k=3)
            context = "\n\n".join([doc.page_content for doc in docs])
            # Generate prompt
            prompt = f"""
Answer the following question based on the provided context.

Context:
{context}

Question: {question}

For a {learning_style}, provide:
1. A clear, concise answer
2. An example or illustration if applicable
3. A connection to any main concepts from the document
"""
            # Call Groq API
            try:
                chat_completion = groq_client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        }
                    ],
                    model=llm_model,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )
                answer = chat_completion.choices[0].message.content
                st.markdown("### Answer")
                st.markdown(answer)
                # Show sources
                with st.expander("Sources"):
                    for i, doc in enumerate(docs):
                        st.markdown(f"**Source {i+1}**")
                        st.markdown(doc.page_content)
                        st.markdown(f"*Page: {doc.metadata.get('page', 'Unknown')}*")
                        st.divider()
            except Exception as e:
                st.error(f"Error generating answer: {str(e)}")
# App footer
st.markdown("---")
st.markdown("BookScribe AI - Powered by Groq and Hugging Face")