"""Streamlit app: generate question-answer pairs from an uploaded PDF using a
Groq-hosted LLM and a Chroma-backed RAG pipeline, with CSV export of results."""
| import os | |
| import re | |
| import pandas as pd | |
| import streamlit as st | |
| from langchain_community.llms import Ollama | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import OllamaEmbeddings | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.output_parsers import StrOutputParser | |
| from langchain_groq import ChatGroq | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
# --- Page chrome and introduction ---
st.set_page_config(page_title="📘 PDF Q&A Generator", page_icon="🤖", layout="wide")
st.title("📘 PDF Question–Answer Generator (GORQ + RAG)")
st.markdown("""
Welcome! Upload a PDF and ask questions about its content.
The system will generate answers and save all Q&A pairs as a CSV.
""")

# --- Sidebar: collect the Groq credential ---
st.sidebar.header("🔑 API Settings")
groq_api_key = st.sidebar.text_input("Enter your Groq API Key:", type="password")

# Gate: nothing below can run without a (non-blank) key.
if not (groq_api_key and groq_api_key.strip()):
    st.warning("⚠️ Please enter your Groq API Key to proceed.")
    st.stop()
# --- Validate the key with a minimal round-trip call before doing real work ---
try:
    groq_api_key = groq_api_key.strip()
    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_api_key, temperature=0)
    # Smoke test: a trivial prompt fails fast on a bad key or network problem.
    # (FIX: the original bound the reply to an unused `response` variable.)
    llm.invoke("Hello")
except Exception as e:  # broad on purpose: any auth/connection failure halts the app
    st.error(f"❌ Invalid Groq API Key or connection error: {e}")
    st.stop()
# --- Require a PDF before continuing ---
uploaded_file = st.file_uploader("📄 Upload a PDF file", type=["pdf"])
if uploaded_file is None:
    st.info("Please upload a PDF file to begin.")
    st.stop()
| if "processed" not in st.session_state: | |
| with st.spinner("📚 Loading and splitting PDF..."): | |
| pdf_path = os.path.join("temp.pdf") | |
| with open(pdf_path, "wb") as f: | |
| f.write(uploaded_file.read()) | |
| loader = PyPDFLoader(pdf_path) | |
| documents = loader.load() | |
| splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100) | |
| texts = splitter.split_documents(documents) | |
| #embedding = OllamaEmbeddings(model="mxbai-embed-large") | |
| embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") | |
| vectorstore = Chroma.from_documents(documents=texts, embedding=embedding) | |
| retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4}) | |
| st.session_state["retriever"] = retriever | |
| st.session_state["texts"] = texts | |
| st.session_state["processed"] = True | |
| st.success(f"✅ Processed {len(st.session_state['texts'])} text chunks from your PDF.") | |
# ---------------------------------------------------------------------------
# Prompt / model wiring for Q&A generation.
# The system prompt pins the model to a strict "Qn:/An:" output layout so
# that parse_qa_pairs() below can recover the pairs with a regex.
# ---------------------------------------------------------------------------
system_prompt = (
    "You are an intelligent question–answer generation assistant. "
    "Your task is to read the provided text content (retrieved from a PDF document) "
    "and create meaningful, diverse, and contextually accurate question–answer pairs.\n\n"
    "Follow these rules strictly:\n"
    "1. Generate clear and concise questions based only on the given text.\n"
    "2. Each question must be answerable from the context — do not invent facts.\n"
    "3. Write the corresponding answer immediately after each question.\n"
    "4. Prefer factual, conceptual, or reasoning-based questions rather than trivial ones.\n"
    "5. Output format must be clean and structured like this:\n\n"
    "Q1: <question text>\n"
    "A1: <answer text>\n\n"
    "Q2: <question text>\n"
    "A2: <answer text>\n\n"
    "6. If the text contains multiple sections, cover all major ideas fairly.\n"
    "7. Avoid repeating the same type of question; vary the question style (factual, analytical, summary, etc.).\n\n"
    "Your output should only include the question–answer pairs. Do not add explanations or comments.\n\n"
    "Here is the context:\n\n{context}"
)
# {context} and {question} are filled in by the RAG chain at invoke time.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{question}")
])
# NOTE: deliberately rebinds `llm` (validated above at temperature=0) to a
# more creative temperature=0.7 instance used for generation.
llm = ChatGroq(model="llama-3.1-8b-instant",
    api_key=groq_api_key, temperature=0.7)
# Extracts the plain string content from the chat model's message output;
# consumed by create_rag_chain() below via module scope.
parser = StrOutputParser()
def create_rag_chain(retriever, model, prompt):
    """Build a retrieval-augmented chain: fetch context, fill the prompt,
    call the model, and parse the reply into a plain string.

    Args:
        retriever: LangChain retriever over the indexed PDF chunks.
        model: chat model to invoke.
        prompt: ChatPromptTemplate expecting ``context`` and ``question``.

    Returns:
        A runnable accepting ``{"question": str}`` (or a bare string).
    """
    def fetch_context(user_input):
        # Accept both {"question": ...} dicts and bare strings.
        if isinstance(user_input, dict):
            user_input = user_input.get("question", "")
        # FIX: retriever.invoke() replaces the deprecated
        # retriever.get_relevant_documents().
        docs = retriever.invoke(user_input)
        context_text = "\n\n".join(doc.page_content for doc in docs)
        return {"context": context_text, "question": user_input}

    # fetch_context is coerced to a RunnableLambda by `|`; `parser` comes
    # from module scope (as in the original).
    chain = fetch_context | prompt | model | parser
    return chain
| rag_chain = create_rag_chain(st.session_state["retriever"], llm, prompt) | |
def parse_qa_pairs(model_output):
    """Extract "Qn:/An:" pairs from raw model text.

    Returns a list of {"Question": ..., "Answer": ...} dicts, one per pair;
    empty list when the text contains no recognizable pairs.
    """
    qa_re = re.compile(r"Q\d+:\s*(.*?)\nA\d+:\s*(.*?)(?=\nQ\d+:|\Z)", re.DOTALL)
    pairs = []
    for question, answer in qa_re.findall(model_output):
        pairs.append({"Question": question.strip(), "Answer": answer.strip()})
    return pairs
# --- Interactive Q&A UI ---
st.subheader("💬 Ask Questions from the PDF")
user_question = st.text_input("Enter your question or request Q&A generation:")

# Accumulated Q&A pairs for the whole session (feeds the CSV download).
if "qa_data" not in st.session_state:
    st.session_state["qa_data"] = []

if st.button("Generate Answer") and user_question.strip():
    with st.spinner("🤖 Generating answer..."):
        # FIX: reuse the module-level chain instead of rebuilding an
        # identical one on every click as the original did.
        model_output = rag_chain.invoke({"question": user_question})
    parsed_qa = parse_qa_pairs(model_output)
    if parsed_qa:
        st.session_state["qa_data"].extend(parsed_qa)
        for i, item in enumerate(parsed_qa, start=1):
            st.markdown(f"**Q{i}:** {item.get('Question', 'No Question Found')}")
            st.markdown(f"**A{i}:** {item.get('Answer', 'No Answer Found')}")
            st.markdown("---")  # separator between Q&A
    else:
        # FIX: the original rendered nothing when the model ignored the
        # Qn:/An: layout; show the raw output so the user still sees an answer.
        st.markdown(model_output)

if st.session_state["qa_data"]:
    df = pd.DataFrame(st.session_state["qa_data"])
    st.download_button(
        label="📥 Download Q&A CSV",
        data=df.to_csv(index=False).encode("utf-8"),
        file_name="qa_results.csv",
        mime="text/csv"
    )