Spaces:

Himel09
/

Generate-Questions-Answers

Sleeping

App Files Files Community

Himel09 committed on Oct 26, 2025

Commit

f3c9795

verified ·

1 Parent(s): 81aead1

Create src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +151 -37

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,154 @@
-import altair as alt
-import numpy as np
 import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import os
+import re
+import tempfile
+import uuid
 import pandas as pd
 import streamlit as st
+from langchain_community.llms import Ollama
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import OllamaEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_groq import ChatGroq
+# Page chrome: browser-tab title/icon, wide layout, headline and intro text.
+st.set_page_config(page_title="📘 PDF Q&A Generator", page_icon="🤖", layout="wide")
+# The provider is Groq (see the sidebar label and ChatGroq client); the
+# headline previously misspelled it.
+st.title("📘 PDF Question–Answer Generator (Groq + RAG)")
+st.markdown("""
+Welcome! Upload a PDF and ask questions about its content.
+The system will generate answers and save all Q&A pairs as a CSV.
+""")
+# Sidebar: collect the Groq API key and verify it before anything else runs.
+st.sidebar.header("🔑 API Settings")
+groq_api_key = st.sidebar.text_input("Enter your Groq API Key:", type="password")
+# Normalise once so the emptiness check and the client see the same value.
+groq_api_key = (groq_api_key or "").strip()
+# Stop execution if API key is missing
+if not groq_api_key:
+    st.warning("⚠️ Please enter your Groq API Key to proceed.")
+    st.stop()
+try:
+    llm = ChatGroq(model="llama-3.1-8b-instant", api_key=groq_api_key, temperature=0)
+    # Streamlit re-executes this script on every widget interaction, so only
+    # spend a test request when the key differs from the last one that passed.
+    if st.session_state.get("validated_groq_key") != groq_api_key:
+        # Test call: ask a trivial question
+        llm.invoke("Hello")
+        st.session_state["validated_groq_key"] = groq_api_key
+except Exception as e:
+    st.error(f"❌ Invalid Groq API Key or connection error: {e}")
+    st.stop()
+# Upload a PDF, split it into chunks, embed them and keep a retriever in the
+# session so later reruns can answer questions without re-indexing.
+uploaded_file = st.file_uploader("📄 Upload a PDF file", type=["pdf"])
+if not uploaded_file:
+    st.info("Please upload a PDF file to begin.")
+    st.stop()
+# Identify the upload by name and size: a bare "processed" flag would keep
+# serving the first document's index after the user uploads a different PDF.
+file_signature = (uploaded_file.name, uploaded_file.size)
+if st.session_state.get("processed_file") != file_signature:
+    with st.spinner("📚 Loading and splitting PDF..."):
+        # PyPDFLoader reads from a path. A unique temporary file avoids
+        # concurrent sessions overwriting one shared "temp.pdf".
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp.write(uploaded_file.getvalue())
+            pdf_path = tmp.name
+        try:
+            documents = PyPDFLoader(pdf_path).load()
+        finally:
+            # The pages are in memory once load() returns (or it failed);
+            # either way the temporary copy is no longer needed.
+            os.remove(pdf_path)
+        splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
+        texts = splitter.split_documents(documents)
+        # NOTE(review): OllamaEmbeddings presumably needs a reachable Ollama
+        # server with this model pulled; confirm for the deployment target.
+        embedding = OllamaEmbeddings(model="mxbai-embed-large")
+        # A per-upload collection name keeps this PDF's chunks apart from any
+        # indexed earlier under the default collection name in this process.
+        vectorstore = Chroma.from_documents(
+            documents=texts,
+            embedding=embedding,
+            collection_name=f"pdf_{uuid.uuid4().hex}",
+        )
+        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})
+        st.session_state["retriever"] = retriever
+        st.session_state["texts"] = texts
+        st.session_state["processed"] = True
+        st.session_state["processed_file"] = file_signature
+st.success(f"✅ Processed {len(st.session_state['texts'])} text chunks from your PDF.")
+# System instructions for the model: generate numbered "Qn:/An:" pairs from
+# the retrieved text only. The "Qn:"/"An:" layout requested here is what
+# parse_qa_pairs() later matches. "{context}" is a template placeholder that
+# is filled with the retrieved chunks.
+system_prompt = (
+    "You are an intelligent question–answer generation assistant. "
+    "Your task is to read the provided text content (retrieved from a PDF document) "
+    "and create meaningful, diverse, and contextually accurate question–answer pairs.\n\n"
+    "Follow these rules strictly:\n"
+    "1. Generate clear and concise questions based only on the given text.\n"
+    "2. Each question must be answerable from the context — do not invent facts.\n"
+    "3. Write the corresponding answer immediately after each question.\n"
+    "4. Prefer factual, conceptual, or reasoning-based questions rather than trivial ones.\n"
+    "5. Output format must be clean and structured like this:\n\n"
+    "Q1: <question text>\n"
+    "A1: <answer text>\n\n"
+    "Q2: <question text>\n"
+    "A2: <answer text>\n\n"
+    "6. If the text contains multiple sections, cover all major ideas fairly.\n"
+    "7. Avoid repeating the same type of question; vary the question style (factual, analytical, summary, etc.).\n\n"
+    "Your output should only include the question–answer pairs. Do not add explanations or comments.\n\n"
+    "Here is the context:\n\n{context}"
+)
+# Two-message chat template: the system instructions above, then the user's
+# request substituted into "{question}".
+prompt = ChatPromptTemplate.from_messages([
+    ("system", system_prompt),
+    ("user", "{question}")
+])
+# Client used for generation. This rebinds `llm`, replacing the
+# temperature=0 instance created during API-key validation with one that
+# uses temperature=0.7.
+llm = ChatGroq(model="llama-3.1-8b-instant",
+               api_key=groq_api_key, temperature=0.7)
+# Converts the chat model's message output into a plain string.
+parser = StrOutputParser()
+def create_rag_chain(retriever, model, prompt):
+    """Build a chain that retrieves context, fills the prompt and calls the model.
+
+    Args:
+        retriever: Retriever queried with the user's question.
+        model: Chat model that receives the filled prompt.
+        prompt: Template with ``{context}`` and ``{question}`` variables.
+
+    Returns:
+        A runnable whose ``invoke`` accepts either the question string or a
+        ``{"question": ...}`` mapping; its output is passed through the
+        module-level ``parser``.
+    """
+    def fetch_context(user_input):
+        # The UI invokes the chain with {"question": ...}. Unwrap it so the
+        # retriever and the "{question}" slot get the text, not the dict.
+        if isinstance(user_input, dict):
+            question = user_input["question"]
+        else:
+            question = user_input
+        # invoke() is the current retriever entry point;
+        # get_relevant_documents() is deprecated.
+        docs = retriever.invoke(question)
+        context = "\n\n".join(doc.page_content for doc in docs)
+        return {"context": context, "question": question}
+    # Piping a plain function into a Runnable coerces the function into the
+    # first step of the sequence.
+    chain = fetch_context | prompt | model | parser
+    return chain
+rag_chain = create_rag_chain(st.session_state["retriever"], llm, prompt)
+def parse_qa_pairs(model_output):
+    """Extract numbered ``Qn:`` / ``An:`` pairs from the model's text.
+
+    Returns a list of ``{"Question": ..., "Answer": ...}`` dicts in the order
+    the pairs appear; the list is empty when nothing matches.
+    """
+    # DOTALL lets an answer span several lines; it ends at the next "Qn:"
+    # line or at the end of the text.
+    qa_regex = re.compile(r"Q\d+:\s*(.*?)\nA\d+:\s*(.*?)(?=\nQ\d+:|\Z)", re.DOTALL)
+    pairs = []
+    for found in qa_regex.finditer(model_output):
+        question_text, answer_text = found.groups()
+        pairs.append({"Question": question_text.strip(), "Answer": answer_text.strip()})
+    return pairs
+# Question box, answer generation, and CSV export of every pair produced in
+# this session.
+st.subheader("💬 Ask Questions from the PDF")
+user_question = st.text_input("Enter your question or request Q&A generation:")
+if "qa_data" not in st.session_state:
+    st.session_state.qa_data = []
+if st.button("Generate Answer"):
+    question_text = user_question.strip()
+    if not question_text:
+        # Tell the user why nothing happened instead of ignoring the click.
+        st.warning("⚠️ Please enter a question before generating an answer.")
+    else:
+        model_output = ""
+        with st.spinner("🤖 Generating answer..."):
+            try:
+                # Pass the question text itself: the chain's first step uses
+                # its input as the retrieval query and the prompt question.
+                model_output = rag_chain.invoke(question_text)
+            except Exception as e:
+                # Report API/retrieval failures in the page rather than
+                # surfacing a raw traceback.
+                st.error(f"❌ Could not generate an answer: {e}")
+        if model_output:
+            # Parse Q&A pairs
+            parsed_qa = parse_qa_pairs(model_output)
+            st.session_state.qa_data.extend(parsed_qa)
+            if not parsed_qa:
+                # The reply did not follow the "Qn:/An:" layout; show it
+                # verbatim so the user still sees the model's response.
+                st.info("The response did not contain Q/A pairs; showing the raw output.")
+                st.markdown(model_output)
+            for i, item in enumerate(parsed_qa, start=1):
+                st.markdown(f"**Q{i}:** {item['Question']}")
+                st.markdown(f"**A{i}:** {item['Answer']}")
+                st.markdown("---")  # separator between Q&A
+if st.session_state.qa_data:
+    df = pd.DataFrame(st.session_state.qa_data)
+    st.download_button(
+        label="📥 Download Q&A CSV",
+        data=df.to_csv(index=False).encode("utf-8"),
+        file_name="qa_results.csv",
+        mime="text/csv"
+    )