"""Streamlit app: upload a PDF, generate Q&A pairs from its content via
Groq (RAG over a Chroma vector store), and export all accumulated pairs
as a downloadable CSV."""

import os  # noqa: F401 -- kept from original import block
import re

import pandas as pd
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings  # noqa: F401 -- optional local backend
from langchain_community.llms import Ollama  # noqa: F401 -- optional local backend
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --------------------------------------------------------------- page setup
st.set_page_config(page_title="📘 PDF Q&A Generator", page_icon="🤖", layout="wide")
st.title("📘 PDF Question–Answer Generator (Groq + RAG)")  # fixed "GORQ" typo
st.markdown("""
Welcome! Upload a PDF and ask questions about its content.
The system will generate answers and save all Q&A pairs as a CSV.
""")

# --------------------------------------------------------------- API key
st.sidebar.header("🔑 API Settings")
groq_api_key = st.sidebar.text_input("Enter your Groq API Key:", type="password")

# Stop execution until a non-blank key is supplied.
if not groq_api_key or not groq_api_key.strip():
    st.warning("⚠️ Please enter your Groq API Key to proceed.")
    st.stop()

groq_api_key = groq_api_key.strip()
try:
    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_api_key, temperature=0)
    llm.invoke("Hello")  # trivial probe: validates key/connectivity up front
except Exception as e:  # boundary: surface any auth/network failure to the UI
    st.error(f"❌ Invalid Groq API Key or connection error: {e}")
    st.stop()

# --------------------------------------------------------------- PDF ingest
uploaded_file = st.file_uploader("📄 Upload a PDF file", type=["pdf"])
if not uploaded_file:
    st.info("Please upload a PDF file to begin.")
    st.stop()

# BUGFIX: the original cached on a bare "processed" flag, so uploading a
# *different* PDF in the same session silently kept answering from the old
# index.  Cache is now keyed on the uploaded file's name.
if st.session_state.get("processed_file") != uploaded_file.name:
    with st.spinner("📚 Loading and splitting PDF..."):
        pdf_path = "temp.pdf"  # single-arg os.path.join was a no-op
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.read())

        documents = PyPDFLoader(pdf_path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
        texts = splitter.split_documents(documents)

        # embedding = OllamaEmbeddings(model="mxbai-embed-large")  # local alternative
        embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = Chroma.from_documents(documents=texts, embedding=embedding)
        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

        st.session_state["retriever"] = retriever
        st.session_state["texts"] = texts
        st.session_state["processed"] = True  # kept for backward compatibility
        st.session_state["processed_file"] = uploaded_file.name

st.success(f"✅ Processed {len(st.session_state['texts'])} text chunks from your PDF.")

# --------------------------------------------------------------- prompting
system_prompt = (
    "You are an intelligent question–answer generation assistant. "
    "Your task is to read the provided text content (retrieved from a PDF document) "
    "and create meaningful, diverse, and contextually accurate question–answer pairs.\n\n"
    "Follow these rules strictly:\n"
    "1. Generate clear and concise questions based only on the given text.\n"
    "2. Each question must be answerable from the context — do not invent facts.\n"
    "3. Write the corresponding answer immediately after each question.\n"
    "4. Prefer factual, conceptual, or reasoning-based questions rather than trivial ones.\n"
    "5. Output format must be clean and structured like this:\n\n"
    "Q1: \n"
    "A1: \n\n"
    "Q2: \n"
    "A2: \n\n"
    "6. If the text contains multiple sections, cover all major ideas fairly.\n"
    "7. Avoid repeating the same type of question; vary the question style (factual, analytical, summary, etc.).\n\n"
    "Your output should only include the question–answer pairs. Do not add explanations or comments.\n\n"
    "Here is the context:\n\n{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{question}"),
])

# Fresh, more creative model instance for generation (the temperature=0 one
# above was only a key probe).
llm = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_api_key, temperature=0.7)
parser = StrOutputParser()


def create_rag_chain(retriever, model, prompt):
    """Build the RAG chain: retrieve context for the question, fill the
    prompt template, invoke the model, and parse the reply to a string.

    Returns an LCEL runnable accepting a raw question string or a
    {"question": ...} dict.
    """

    def fetch_context(user_input):
        # Accept either a plain string or a {"question": ...} mapping.
        if isinstance(user_input, dict):
            user_input = user_input.get("question", "")
        # .invoke() replaces the deprecated get_relevant_documents().
        docs = retriever.invoke(user_input)
        context_text = "\n\n".join(doc.page_content for doc in docs)
        return {"context": context_text, "question": user_input}

    # The plain function is coerced to a RunnableLambda by the `|` operator.
    return fetch_context | prompt | model | parser


rag_chain = create_rag_chain(st.session_state["retriever"], llm, prompt)


def parse_qa_pairs(model_output):
    """Extract 'Qn: ... / An: ...' pairs from *model_output*.

    Returns a list of {"Question": ..., "Answer": ...} dicts; empty when
    the output does not follow the Q/A format.
    """
    pattern = r"Q\d+:\s*(.*?)\nA\d+:\s*(.*?)(?=\nQ\d+:|\Z)"
    matches = re.findall(pattern, model_output, re.DOTALL)
    return [{"Question": q.strip(), "Answer": a.strip()} for q, a in matches]


# --------------------------------------------------------------- Q&A UI
st.subheader("💬 Ask Questions from the PDF")
user_question = st.text_input("Enter your question or request Q&A generation:")

if "qa_data" not in st.session_state:
    st.session_state.qa_data = []

if st.button("Generate Answer") and user_question.strip():
    with st.spinner("🤖 Generating answer..."):
        # NOTE: the original rebuilt an identical rag_chain here; the
        # module-level chain is reused instead.
        model_output = rag_chain.invoke({"question": user_question})

    parsed_qa = parse_qa_pairs(model_output)
    if parsed_qa:
        st.session_state.qa_data.extend(parsed_qa)
        for i, item in enumerate(parsed_qa, start=1):
            st.markdown(f"**Q{i}:** {item.get('Question', 'No Question Found')}")
            st.markdown(f"**A{i}:** {item.get('Answer', 'No Answer Found')}")
            st.markdown("---")  # separator between Q&A
    else:
        # BUGFIX: the original displayed nothing at all when the model
        # answered in free text instead of the Qn/An format.  Show the raw
        # output and record it as a single Q&A row so it reaches the CSV.
        st.markdown(model_output)
        st.session_state.qa_data.append(
            {"Question": user_question, "Answer": model_output}
        )

# --------------------------------------------------------------- CSV export
if st.session_state.qa_data:
    df = pd.DataFrame(st.session_state.qa_data)
    st.download_button(
        label="📥 Download Q&A CSV",
        data=df.to_csv(index=False).encode("utf-8"),
        file_name="qa_results.csv",
        mime="text/csv",
    )