File size: 8,386 Bytes
3b270cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import os
import io
import re
from typing import List, Tuple

import streamlit as st
from transformers import pipeline
import PyPDF2
import nltk

@st.cache_resource
def download_nltk():
    """Download the NLTK 'punkt' sentence tokenizer.

    Wrapped in st.cache_resource so the download is attempted only once
    per server process rather than on every script rerun.
    """
    nltk.download("punkt", quiet=True)

# Ensure the tokenizer is present before any sent_tokenize() call below.
download_nltk()

# ---------- Sidebar: author profile, contact links, and app-mode picker ----------
with st.sidebar:
    st.image(
        "https://raw.githubusercontent.com/Runtimepirate/About_me/main/Profile_pic.jpg",
        width=200,
    )
    st.markdown(
        "## **Mr. Aditya Katariya [[Resume](https://drive.google.com/file/d/1Vq9-H1dl5Kky2ugXPIbnPvJ72EEkTROY/view?usp=drive_link)]**"
    )
    st.markdown(" *College - Noida Institute of Engineering and Technology, U.P*")
    st.markdown("----")
    st.markdown("## Contact Details:-")
    st.markdown("📫 *[Prasaritation@gmail.com](mailto:Prasaritation@gmail.com)*")
    st.markdown("💼 *[LinkedIn](https://www.linkedin.com/in/adityakatariya/)*")
    st.markdown("💻 *[GitHub](https://github.com/Runtimepirate)*")
    st.markdown("----")
    st.markdown("**AI & ML Enthusiast**")
    st.markdown(
        "Passionate about solving real-world problems using data science and customer analytics. Always learning and building smart, scalable AI solutions."
    )
    st.markdown("----")
    # Mode switch: free-form Q&A ("Ask Anything") vs. generated quiz ("Challenge Me").
    # Read back below in the main flow to pick which UI branch to render.
    mode = st.radio("Choose Mode:", ["Ask Anything", "Challenge Me"], key="mode")

st.title("📚 Document‑Aware Assistant")

# Intro blurb shown in the main pane describing the app's capabilities.
st.markdown(
    """
This assistant **reads your uploaded PDF or TXT document**, produces a *≤150‑word* summary, answers your questions with paragraph‑level justification, **generates logic‑based questions**, and evaluates your responses.
"""
)

@st.cache_resource(show_spinner=True)
def load_models():
    """Load all required Hugging Face pipelines once and reuse.

    Returns:
        tuple: (summarizer, qa, qg) —
            summarizer: abstractive summarization pipeline,
            qa: extractive question-answering pipeline,
            qg: question-generation (text2text) pipeline.
    """
    # Abstractive summarization (BART fine-tuned on CNN/DailyMail).
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device_map="auto",  # NOTE(review): device_map requires `accelerate` — confirm it is installed
    )
    # Extractive QA (RoBERTa fine-tuned on SQuAD 2.0, supports "no answer").
    qa = pipeline(
        "question-answering",
        model="deepset/roberta-base-squad2",
        device_map="auto",
    )
    # Question generation from a <hl>-highlighted sentence (see generate_logic_questions).
    qg = pipeline(
        "text2text-generation",
        model="valhalla/t5-small-qg-hl",  
        device_map="auto",
        max_length=64,
    )
    return summarizer, qa, qg

# Loaded once per process thanks to st.cache_resource.
summarizer, qa_pipeline, qg_pipeline = load_models()

def extract_text_from_pdf(uploaded_file: io.BytesIO) -> str:
    """Concatenate the extractable text of every PDF page, one page per line.

    Pages for which PyPDF2 yields no text (e.g. scanned images) are skipped.
    """
    pages = PyPDF2.PdfReader(uploaded_file).pages
    non_empty = (content for content in (page.extract_text() for page in pages) if content)
    return "".join(content + "\n" for content in non_empty)

def extract_text(uploaded_file) -> str:
    """Dispatch on file extension: .pdf -> PyPDF2 extraction, .txt -> UTF-8 decode.

    Any other extension yields an empty string. Decoding errors in TXT files
    are silently ignored rather than raised.
    """
    filename = uploaded_file.name.lower()
    if filename.endswith(".txt"):
        return uploaded_file.read().decode("utf-8", errors="ignore")
    if filename.endswith(".pdf"):
        return extract_text_from_pdf(uploaded_file)
    return ""

def chunk_text(text: str, max_tokens: int = 450) -> List[str]:
    """Greedily pack whole sentences into chunks of at most ~max_tokens words.

    Sentence boundaries come from NLTK; "tokens" are whitespace-split words.
    A single sentence longer than max_tokens still becomes its own chunk.
    """
    chunks: List[str] = []
    buffer: List[str] = []
    buffered = 0

    for sentence in nltk.sent_tokenize(text):
        size = len(sentence.split())
        # Flush the buffer first when adding this sentence would overflow it.
        if buffer and buffered + size > max_tokens:
            chunks.append(" ".join(buffer))
            buffer, buffered = [], 0
        buffer.append(sentence)
        buffered += size

    if buffer:
        chunks.append(" ".join(buffer))
    return chunks

def get_best_answer(question: str, chunks: List[str]) -> Tuple[str, int, int, float, str]:
    """Run extractive QA over every chunk and keep the highest-scoring answer.

    Returns (answer, start, end, score, context_chunk); the sentinel
    ("", 0, 0, 0.0, "") is returned when no chunk yields a usable answer.
    Per-chunk pipeline failures are ignored so one bad chunk cannot abort the scan.
    """
    top: Tuple[str, int, int, float, str] | None = None
    top_score = -float("inf")

    for passage in chunks:
        try:
            result = qa_pipeline(question=question, context=passage)
            if result["score"] > top_score and result["answer"].strip():
                top_score = result["score"]
                top = (
                    result["answer"],
                    result["start"],
                    result["end"],
                    result["score"],
                    passage,
                )
        except Exception:
            continue

    if top is None:
        return "", 0, 0, 0.0, ""
    return top

def highlight_answer(context: str, start: int, end: int) -> str:
    """Return *context* with the span [start:end) wrapped in markdown bold.

    A single space is inserted on each side of the ** markers so the bold
    renders correctly even when the span abuts other word characters.
    """
    return f"{context[:start]} **{context[start:end]}** {context[end:]}"

def generate_logic_questions(text: str, num_q: int = 3) -> List[str]:
    """Generate num_q questions from the document using the QG pipeline.

    Each sentence is wrapped in <hl> markers (the format the valhalla/t5-*-qg-hl
    models expect) and fed to the pipeline; duplicates and failed generations
    are skipped. If fewer than num_q questions are produced, generic fallback
    questions pad the list.

    Args:
        text: Full document text to draw sentences from.
        num_q: Number of questions to return.

    Returns:
        A list of exactly num_q question strings.
    """
    sentences = nltk.sent_tokenize(text)
    questions: List[str] = []
    for sent in sentences:
        if len(questions) >= num_q:
            break
        hl_text = f"<hl> {sent} <hl> "
        try:
            q = qg_pipeline(hl_text, do_sample=False, max_length=64)[0]["generated_text"]
            # Normalize trailing punctuation so every question ends with exactly one "?".
            q = q.strip().rstrip("?.!") + "?"
            if q not in questions:
                questions.append(q)
        except Exception:
            continue
    default_q = [
        "What is the main topic of the document?",
        "Summarize the methodology described.",
        "What are the key findings or conclusions?",
    ]
    # BUGFIX: indexing default_q[len(questions)] raised IndexError whenever more
    # than len(default_q) fallbacks were needed (e.g. num_q > 3 with QG failing).
    # Cycle through the fallbacks with modulo instead.
    while len(questions) < num_q:
        questions.append(default_q[len(questions) % len(default_q)])
    return questions


    
# Silence the Hugging Face tokenizers fork/parallelism warning under Streamlit.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
uploaded = st.file_uploader("Upload PDF or TXT Document", type=["pdf", "txt"], key="uploader")

if uploaded:
    doc_text = extract_text(uploaded)
    st.session_state["doc_text"] = doc_text

    # BUGFIX: per-document caches ("chunks", "logic_questions", "user_answers")
    # previously survived a new upload, so answers and quiz questions kept coming
    # from the *previous* document. Invalidate them whenever the extracted text
    # changes, keyed by a hash of the document body.
    doc_key = hash(doc_text)
    if st.session_state.get("doc_key") != doc_key:
        st.session_state["doc_key"] = doc_key
        for stale in ("chunks", "logic_questions", "user_answers"):
            st.session_state.pop(stale, None)

    st.subheader("🔎 Auto Summary (≤ 150 words)")
    try:
        summary = summarizer(
            doc_text[:4096],  # truncate: the summarizer cannot ingest arbitrarily long input
            max_length=150,
            min_length=30,
            do_sample=False,
        )[0]["summary_text"]
        st.write(summary)
    except Exception as e:
        st.error(f"Summarization failed: {e}")

    # Sentence-aligned chunks sized for the QA model; computed once per document.
    if "chunks" not in st.session_state:
        st.session_state["chunks"] = chunk_text(doc_text)

    if mode == "Ask Anything":
        st.subheader("💬 Ask Anything")
        question = st.text_input("Ask a question about the document:", key="user_question")
        if st.button("Submit Question", key="submit_question") and question:
            with st.spinner("Finding answer..."):
                ans, start, end, score, context = get_best_answer(
                    question, st.session_state["chunks"]
                )
            if ans:
                st.markdown(f"**Answer:** {ans}")
                # Show the answer bolded inside its source paragraph as justification.
                justification = highlight_answer(context, start, end)
                st.caption(f"Justification: …{justification[:300]}…")
                st.caption(
                    f"Confidence Score: {score:.3f}  |  Paragraph tokens: {len(context.split())}"
                )
            else:
                st.warning("Sorry, I couldn't find an answer in the document.")

    elif mode == "Challenge Me":
        st.subheader("🎯 Challenge Me")
        # Generate quiz questions once per document (reset above on a new upload).
        if "logic_questions" not in st.session_state:
            st.session_state["logic_questions"] = generate_logic_questions(doc_text)
            st.session_state["user_answers"] = ["" for _ in st.session_state["logic_questions"]]

        for idx, q in enumerate(st.session_state["logic_questions"]):
            st.text_input(f"Q{idx+1}: {q}", key=f"logic_q_{idx}")

        if st.button("Submit Answers", key="submit_logic"):
            st.markdown("----")
            for idx, q in enumerate(st.session_state["logic_questions"]):
                user_ans = st.session_state.get(f"logic_q_{idx}", "").strip()
                # "Expected" answer = best extractive answer for the generated question.
                correct, start, end, score, context = get_best_answer(
                    q, st.session_state["chunks"]
                )
                st.markdown(f"**Q{idx+1} Evaluation:**")
                st.write(f"*Your Answer*: {user_ans or '—'}")
                st.write(f"*Expected Answer*: {correct or 'Not found in document'}")
                if correct:
                    justification = highlight_answer(context, start, end)
                    st.caption(f"Justification: …{justification[:300]}…")
                    st.caption(f"Confidence Score: {score:.3f}")
                st.markdown("----")

else:
    st.info("Please upload a PDF or TXT document to begin.")