import os
import io
import re
from typing import List, Tuple
import streamlit as st
from transformers import pipeline
import PyPDF2
import nltk
# Silence the tokenizers fork-safety warning before any model loads.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@st.cache_resource
def download_nltk():
    nltk.download("punkt", quiet=True)
    # NLTK >= 3.9 looks up "punkt_tab" instead; quiet=True makes this a
    # harmless no-op on older releases that don't ship it.
    nltk.download("punkt_tab", quiet=True)

download_nltk()
with st.sidebar:
st.image(
"https://raw.githubusercontent.com/Runtimepirate/About_me/main/Profile_pic.jpg",
width=200,
)
st.markdown(
"## **Mr. Aditya Katariya [[Resume](https://drive.google.com/file/d/1Vq9-H1dl5Kky2ugXPIbnPvJ72EEkTROY/view?usp=drive_link)]**"
)
st.markdown(" *College - Noida Institute of Engineering and Technology, U.P*")
st.markdown("----")
st.markdown("## Contact Details:-")
st.markdown("📫 *[Prasaritation@gmail.com](mailto:Prasaritation@gmail.com)*")
st.markdown("💼 *[LinkedIn](https://www.linkedin.com/in/adityakatariya/)*")
st.markdown("💻 *[GitHub](https://github.com/Runtimepirate)*")
st.markdown("----")
st.markdown("**AI & ML Enthusiast**")
st.markdown(
"Passionate about solving real-world problems using data science and customer analytics. Always learning and building smart, scalable AI solutions."
)
st.markdown("----")
mode = st.radio("Choose Mode:", ["Ask Anything", "Challenge Me"], key="mode")
st.title("📚 Document‑Aware Assistant")
st.markdown(
"""
This assistant **reads your uploaded PDF or TXT document**, produces a *≤150‑word* summary, answers your questions with paragraph‑level justification, **generates logic‑based questions**, and evaluates your responses.
"""
)
@st.cache_resource(show_spinner=True)
def load_models():
"""Load all required Hugging Face pipelines once and reuse."""
summarizer = pipeline(
"summarization",
model="facebook/bart-large-cnn",
device_map="auto",
)
qa = pipeline(
"question-answering",
model="deepset/roberta-base-squad2",
device_map="auto",
)
qg = pipeline(
"text2text-generation",
model="valhalla/t5-small-qg-hl",
device_map="auto",
max_length=64,
)
return summarizer, qa, qg
summarizer, qa_pipeline, qg_pipeline = load_models()
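# device_map="auto" needs the `accelerate` package at runtime. If accelerate
# isn't installed, a plain device index is a workable fallback (sketch,
# assumes torch is present alongside transformers):
#
#   import torch
#   device = 0 if torch.cuda.is_available() else -1
#   summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)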
def extract_text_from_pdf(uploaded_file: io.BytesIO) -> str:
reader = PyPDF2.PdfReader(uploaded_file)
text = ""
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
return text
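# Note: PyPDF2 only recovers embedded text; image-only (scanned) pages yield
# nothing, which the empty-text guard after upload reports to the user.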
def extract_text(uploaded_file) -> str:
if uploaded_file.name.lower().endswith(".pdf"):
return extract_text_from_pdf(uploaded_file)
elif uploaded_file.name.lower().endswith(".txt"):
return uploaded_file.read().decode("utf-8", errors="ignore")
return ""
def chunk_text(text: str, max_tokens: int = 450) -> List[str]:
    """Split text into sentence-aligned chunks of roughly max_tokens
    whitespace-separated words (a cheap proxy for model tokens)."""
sentences = nltk.sent_tokenize(text)
chunks: List[str] = []
current: List[str] = []
token_count = 0
for sent in sentences:
num_tokens = len(sent.split())
if token_count + num_tokens > max_tokens and current:
chunks.append(" ".join(current))
current = []
token_count = 0
current.append(sent)
token_count += num_tokens
if current:
chunks.append(" ".join(current))
return chunks
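# Quick sanity check of the chunker (illustrative; with the default of 450
# words per chunk, each chunk stays inside the QA model's 512-token window):
#
#   chunk_text("First sentence. Second sentence.", max_tokens=3)
#   # -> ["First sentence.", "Second sentence."]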
def get_best_answer(question: str, chunks: List[str]) -> Tuple[str, int, int, float, str]:
    """Run QA over every chunk; return the best answer, its character offsets
    within the winning chunk, the confidence score, and that chunk."""
best = {"score": -float("inf")}
for chunk in chunks:
try:
answer = qa_pipeline(question=question, context=chunk)
if answer["score"] > best["score"] and answer["answer"].strip():
best = {
"answer": answer["answer"],
"score": answer["score"],
"start": answer["start"],
"end": answer["end"],
"context": chunk,
}
except Exception:
continue
if best["score"] == -float("inf"):
return "", 0, 0, 0.0, ""
return (
best["answer"],
best["start"],
best["end"],
best["score"],
best["context"],
)
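# Illustrative use: the offsets are relative to the returned chunk, so they
# can be fed straight to highlight_answer below.
#
#   ans, start, end, score, ctx = get_best_answer("Who is the author?", chunks)
#   if ans:
#       st.write(highlight_answer(ctx, start, end))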
def highlight_answer(context: str, start: int, end: int) -> str:
"""Return context with the answer wrapped in **bold** for display."""
return (
context[:start]
+ " **"
+ context[start:end]
+ "** "
+ context[end:]
)
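
# The intro promises to evaluate user responses; token-overlap F1 (the classic
# SQuAD answer metric, minus its normalization step) is one lightweight,
# model-free way to score them. An illustrative sketch, wired into
# "Challenge Me" below.
def token_f1(prediction: str, reference: str) -> float:
    """Token-level F1 between two answer strings, in [0.0, 1.0]."""
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        return 0.0
    # Count shared tokens, respecting multiplicity.
    overlap = sum(min(pred_tokens.count(t), ref_tokens.count(t)) for t in set(pred_tokens))
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)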
def generate_logic_questions(text: str, num_q: int = 3) -> List[str]:
"""Generate num_q questions from the document using QG pipeline."""
sentences = nltk.sent_tokenize(text)
questions: List[str] = []
for sent in sentences:
if len(questions) >= num_q:
break
hl_text = f"<hl> {sent} <hl> "
try:
q = qg_pipeline(hl_text, do_sample=False, max_length=64)[0]["generated_text"]
q = q.strip().rstrip("?.!") + "?"
if q not in questions:
questions.append(q)
except Exception:
continue
default_q = [
"What is the main topic of the document?",
"Summarize the methodology described.",
"What are the key findings or conclusions?",
]
    while len(questions) < num_q:
        # Cycle the fallbacks so num_q > 3 can't index past the list.
        questions.append(default_q[len(questions) % len(default_q)])
return questions
uploaded = st.file_uploader("Upload PDF or TXT Document", type=["pdf", "txt"], key="uploader")
if uploaded:
    doc_text = extract_text(uploaded)
    if not doc_text.strip():
        st.error("No extractable text found. Scanned PDFs need OCR first.")
        st.stop()
    st.session_state["doc_text"] = doc_text
st.subheader("🔎 Auto Summary (≤ 150 words)")
    try:
        # bart-large-cnn accepts at most 1024 tokens; truncate instead of erroring.
        summary = summarizer(
            doc_text[:4096],
            max_length=150,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
st.write(summary)
except Exception as e:
st.error(f"Summarization failed: {e}")
if "chunks" not in st.session_state:
st.session_state["chunks"] = chunk_text(doc_text)
if mode == "Ask Anything":
st.subheader("💬 Ask Anything")
question = st.text_input("Ask a question about the document:", key="user_question")
if st.button("Submit Question", key="submit_question") and question:
with st.spinner("Finding answer..."):
ans, start, end, score, context = get_best_answer(
question, st.session_state["chunks"]
)
if ans:
st.markdown(f"**Answer:** {ans}")
justification = highlight_answer(context, start, end)
st.caption(f"Justification: …{justification[:300]}…")
st.caption(
f"Confidence Score: {score:.3f} | Paragraph tokens: {len(context.split())}"
)
else:
st.warning("Sorry, I couldn't find an answer in the document.")
elif mode == "Challenge Me":
st.subheader("🎯 Challenge Me")
if "logic_questions" not in st.session_state:
st.session_state["logic_questions"] = generate_logic_questions(doc_text)
st.session_state["user_answers"] = ["" for _ in st.session_state["logic_questions"]]
for idx, q in enumerate(st.session_state["logic_questions"]):
st.text_input(f"Q{idx+1}: {q}", key=f"logic_q_{idx}")
if st.button("Submit Answers", key="submit_logic"):
st.markdown("----")
for idx, q in enumerate(st.session_state["logic_questions"]):
user_ans = st.session_state.get(f"logic_q_{idx}", "").strip()
correct, start, end, score, context = get_best_answer(
q, st.session_state["chunks"]
)
st.markdown(f"**Q{idx+1} Evaluation:**")
st.write(f"*Your Answer*: {user_ans or '—'}")
st.write(f"*Expected Answer*: {correct or 'Not found in document'}")
if correct:
justification = highlight_answer(context, start, end)
st.caption(f"Justification: …{justification[:300]}…")
st.caption(f"Confidence Score: {score:.3f}")
st.markdown("----")
else:
st.info("Please upload a PDF or TXT document to begin.")