"""Streamlit app: generate question-answer pairs from an uploaded PDF using a
Groq-hosted LLM and a Chroma-backed RAG pipeline, with CSV export of results."""
| import os | |
| import re | |
| import pandas as pd | |
| import streamlit as st | |
| from langchain_community.llms import Ollama | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import OllamaEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_groq import ChatGroq | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
# --- Page chrome and introduction ---
st.set_page_config(page_title="📘 PDF Q&A Generator", page_icon="🤖", layout="wide")
st.title("📘 PDF Question–Answer Generator (GORQ + RAG)")
st.markdown("""
Welcome! Upload a PDF and ask questions about its content.
The system will generate answers and save all Q&A pairs as a CSV.
""")

# --- Sidebar: collect the Groq credential ---
st.sidebar.header("🔑 API Settings")
groq_api_key = st.sidebar.text_input("Enter your Groq API Key:", type="password")

# Gate: nothing below can run without a (non-blank) key.
if not (groq_api_key and groq_api_key.strip()):
    st.warning("⚠️ Please enter your Groq API Key to proceed.")
    st.stop()
# --- Validate the key with a minimal round-trip call before doing real work ---
try:
    groq_api_key = groq_api_key.strip()
    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_api_key, temperature=0)
    # Smoke test: a trivial prompt fails fast on a bad key or network problem.
    # (FIX: the original bound the reply to an unused `response` variable.)
    llm.invoke("Hello")
except Exception as e:  # broad on purpose: any auth/connection failure halts the app
    st.error(f"❌ Invalid Groq API Key or connection error: {e}")
    st.stop()
# --- Require a PDF before continuing ---
uploaded_file = st.file_uploader("📄 Upload a PDF file", type=["pdf"])
if uploaded_file is None:
    st.info("Please upload a PDF file to begin.")
    st.stop()
| if "processed" not in st.session_state: | |
| with st.spinner("📚 Loading and splitting PDF..."): | |
| pdf_path = os.path.join("temp.pdf") | |
| with open(pdf_path, "wb") as f: | |
| f.write(uploaded_file.read()) | |
| loader = PyPDFLoader(pdf_path) | |
| documents = loader.load() | |
| splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100) | |
| texts = splitter.split_documents(documents) | |
| #embedding = OllamaEmbeddings(model="mxbai-embed-large") | |
| embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") | |
| vectorstore = Chroma.from_documents(documents=texts, embedding=embedding) | |
| retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4}) | |
| st.session_state["retriever"] = retriever | |
| st.session_state["texts"] = texts | |
| st.session_state["processed"] = True | |
| st.success(f"✅ Processed {len(st.session_state['texts'])} text chunks from your PDF.") | |
# ---------------------------------------------------------------------------
# Prompt / model wiring for Q&A generation.
# The system prompt pins the model to a strict "Qn:/An:" output layout so
# that parse_qa_pairs() below can recover the pairs with a regex.
# ---------------------------------------------------------------------------
system_prompt = (
    "You are an intelligent question–answer generation assistant. "
    "Your task is to read the provided text content (retrieved from a PDF document) "
    "and create meaningful, diverse, and contextually accurate question–answer pairs.\n\n"
    "Follow these rules strictly:\n"
    "1. Generate clear and concise questions based only on the given text.\n"
    "2. Each question must be answerable from the context — do not invent facts.\n"
    "3. Write the corresponding answer immediately after each question.\n"
    "4. Prefer factual, conceptual, or reasoning-based questions rather than trivial ones.\n"
    "5. Output format must be clean and structured like this:\n\n"
    "Q1: <question text>\n"
    "A1: <answer text>\n\n"
    "Q2: <question text>\n"
    "A2: <answer text>\n\n"
    "6. If the text contains multiple sections, cover all major ideas fairly.\n"
    "7. Avoid repeating the same type of question; vary the question style (factual, analytical, summary, etc.).\n\n"
    "Your output should only include the question–answer pairs. Do not add explanations or comments.\n\n"
    "Here is the context:\n\n{context}"
)
# {context} and {question} are filled in by the RAG chain at invoke time.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{question}")
])
# NOTE: deliberately rebinds `llm` (validated above at temperature=0) to a
# more creative temperature=0.7 instance used for generation.
llm = ChatGroq(model="llama-3.1-8b-instant",
    api_key=groq_api_key, temperature=0.7)
# Extracts the plain string content from the chat model's message output;
# consumed by create_rag_chain() below via module scope.
parser = StrOutputParser()
def create_rag_chain(retriever, model, prompt):
    """Build a retrieval-augmented chain: fetch context, fill the prompt,
    call the model, and parse the reply into a plain string.

    Args:
        retriever: LangChain retriever over the indexed PDF chunks.
        model: chat model to invoke.
        prompt: ChatPromptTemplate expecting ``context`` and ``question``.

    Returns:
        A runnable accepting ``{"question": str}`` (or a bare string).
    """
    def fetch_context(user_input):
        # Accept both {"question": ...} dicts and bare strings.
        if isinstance(user_input, dict):
            user_input = user_input.get("question", "")
        # FIX: retriever.invoke() replaces the deprecated
        # retriever.get_relevant_documents().
        docs = retriever.invoke(user_input)
        context_text = "\n\n".join(doc.page_content for doc in docs)
        return {"context": context_text, "question": user_input}

    # fetch_context is coerced to a RunnableLambda by `|`; `parser` comes
    # from module scope (as in the original).
    chain = fetch_context | prompt | model | parser
    return chain
| rag_chain = create_rag_chain(st.session_state["retriever"], llm, prompt) | |
def parse_qa_pairs(model_output):
    """Extract "Qn:/An:" pairs from raw model text.

    Returns a list of {"Question": ..., "Answer": ...} dicts, one per pair;
    empty list when the text contains no recognizable pairs.
    """
    qa_re = re.compile(r"Q\d+:\s*(.*?)\nA\d+:\s*(.*?)(?=\nQ\d+:|\Z)", re.DOTALL)
    pairs = []
    for question, answer in qa_re.findall(model_output):
        pairs.append({"Question": question.strip(), "Answer": answer.strip()})
    return pairs
# --- Interactive Q&A UI ---
st.subheader("💬 Ask Questions from the PDF")
user_question = st.text_input("Enter your question or request Q&A generation:")

# Accumulated Q&A pairs for the whole session (feeds the CSV download).
if "qa_data" not in st.session_state:
    st.session_state["qa_data"] = []

if st.button("Generate Answer") and user_question.strip():
    with st.spinner("🤖 Generating answer..."):
        # FIX: reuse the module-level chain instead of rebuilding an
        # identical one on every click as the original did.
        model_output = rag_chain.invoke({"question": user_question})
    parsed_qa = parse_qa_pairs(model_output)
    if parsed_qa:
        st.session_state["qa_data"].extend(parsed_qa)
        for i, item in enumerate(parsed_qa, start=1):
            st.markdown(f"**Q{i}:** {item.get('Question', 'No Question Found')}")
            st.markdown(f"**A{i}:** {item.get('Answer', 'No Answer Found')}")
            st.markdown("---")  # separator between Q&A
    else:
        # FIX: the original rendered nothing when the model ignored the
        # Qn:/An: layout; show the raw output so the user still sees an answer.
        st.markdown(model_output)

if st.session_state["qa_data"]:
    df = pd.DataFrame(st.session_state["qa_data"])
    st.download_button(
        label="📥 Download Q&A CSV",
        data=df.to_csv(index=False).encode("utf-8"),
        file_name="qa_results.csv",
        mime="text/csv"
    )