import subprocess, sys
# Streamlit re-executes this script on every interaction, so only shell out to
# the spaCy downloader when the model package cannot be imported yet.
import importlib.util

if importlib.util.find_spec("en_core_web_sm") is None:
    # Best effort: output is discarded and a non-zero exit is not raised here;
    # a model that is still missing surfaces later when spacy.load() is called.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,
    )
    # Let the import system notice a package installed after interpreter start.
    importlib.invalidate_caches()
import streamlit as st
import torch
import spacy
from transformers import T5ForConditionalGeneration, AutoTokenizer
# ── Page config ──────────────────────────────────────────────────────
# Page-level settings: browser-tab title and icon, wide layout, and a sidebar
# that starts collapsed.
_page_settings = {
    "page_title": "QuestionForge — AI Question Generator",
    "page_icon": "⚡",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_page_settings)

# ── Global CSS ────────────────────────────────────────────────────────
# The injected markup is currently a lone newline, so nothing is styled here.
_global_css = "\n"
st.markdown(_global_css, unsafe_allow_html=True)
# ══════════════════════════════════════════════════════════════════════
# HERO
# ══════════════════════════════════════════════════════════════════════
# Hero banner: tagline, headline, one-line pitch and the metric figures, sent
# to the page as a single markdown block. The literal holds plain text only.
_hero_block = """
T5 · SQuAD 1.1 · NLP · MPhil Data Science
Automatic Question
Generation
Paste any educational passage and generate exam-ready questions instantly using a fine-tuned T5 Transformer.
0.4509ROUGE-1
0.2445ROUGE-2
0.4167ROUGE-L
0.1581BLEU
60MParameters
"""
st.markdown(_hero_block, unsafe_allow_html=True)
# ══════════════════════════════════════════════════════════════════════
# MODEL LOADING
# ══════════════════════════════════════════════════════════════════════
@st.cache_resource(show_spinner=False)
def load_model():
    """Load the tokenizer, the T5 model and the spaCy pipeline.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the returned objects instead of rebuilding them on each script run.

    Returns:
        A 4-tuple ``(tokenizer, model, device, nlp)`` where ``device`` is the
        string ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise, and
        ``model`` has already been moved to that device.
    """
    hub_repo = "Hamzasajjad38/t5-small-qg"

    # Decide the target device up front; the model is moved there right after
    # it has been instantiated.
    target = "cpu"
    if torch.cuda.is_available():
        target = "cuda"

    tok = AutoTokenizer.from_pretrained(hub_repo, use_fast=True)
    net = T5ForConditionalGeneration.from_pretrained(hub_repo).to(target)
    pipeline = spacy.load("en_core_web_sm")
    return tok, net, target, pipeline
# Show a spinner while the (cached) loader runs, then publish the four shared
# objects as module-level names used by the generation code further down.
with st.spinner("⚡ Loading T5 model from Hugging Face Hub…"):
    _resources = load_model()
tokenizer, model, device, nlp = _resources
# ══════════════════════════════════════════════════════════════════════
# SAMPLE PASSAGES
# ══════════════════════════════════════════════════════════════════════
# Each sample passage is kept in its own constant; adjacent string literals
# are concatenated by the parser, so every passage is a single-line string.
_PHOTOSYNTHESIS = (
    "Photosynthesis is a process used by plants and other organisms to convert light energy into "
    "chemical energy stored in glucose. This process takes place inside chloroplasts, where the "
    "green pigment chlorophyll absorbs sunlight. Carbon dioxide from the air and water from the "
    "soil are the main raw materials used during photosynthesis. Oxygen is released as a byproduct "
    "of this process, which is essential for most life on Earth."
)

_INDUSTRIAL_REVOLUTION = (
    "The Industrial Revolution began in Britain in the late 18th century and transformed "
    "manufacturing, agriculture, and transportation. The invention of the steam engine by James "
    "Watt in 1769 played a pivotal role, enabling factories to operate machinery and railroads to "
    "carry goods across the country. Workers migrated from rural areas to cities, leading to rapid "
    "urbanization and significant social changes across Europe and North America."
)

_MACHINE_LEARNING = (
    "Machine learning is a subset of artificial intelligence that enables systems to learn from "
    "data and improve their performance without being explicitly programmed. Supervised learning, "
    "unsupervised learning, and reinforcement learning are the three main paradigms. Neural networks "
    "form the foundation of deep learning, revolutionizing fields like computer vision, speech "
    "recognition, and natural language processing. Geoffrey Hinton, Yann LeCun, and Yoshua Bengio "
    "are widely regarded as the pioneers of deep learning."
)

_AMAZON_RIVER = (
    "The Amazon River in South America is the largest river in the world by water discharge, "
    "carrying about 20 percent of all freshwater that flows into the oceans. It flows through "
    "Brazil, Peru, and Colombia, spanning over 6,400 kilometres. Its basin hosts the world's "
    "largest tropical rainforest, home to millions of plant and animal species. The Amazon plays "
    "a critical role in regulating Earth's climate by absorbing vast amounts of carbon dioxide."
)

_RELATIVITY = (
    "Albert Einstein published his special theory of relativity in 1905 while working as a patent "
    "clerk in Bern, Switzerland. The theory introduced the famous equation E = mc², demonstrating "
    "that mass and energy are interchangeable. In 1915, Einstein extended this with the general "
    "theory of relativity, which described gravity as the curvature of spacetime caused by mass. "
    "In 1921, he received the Nobel Prize in Physics for the discovery of the law of the "
    "photoelectric effect."
)

# Label -> passage text. The first entry maps to an empty string so the text
# area starts blank when the user wants to paste their own passage.
SAMPLES = {
    "— paste your own —": "",
    "🌿 Biology — Photosynthesis": _PHOTOSYNTHESIS,
    "⚙️ History — Industrial Revolution": _INDUSTRIAL_REVOLUTION,
    "🤖 AI — Machine Learning": _MACHINE_LEARNING,
    "🌍 Geography — Amazon River": _AMAZON_RIVER,
    "🔬 Physics — Relativity": _RELATIVITY,
}
# ══════════════════════════════════════════════════════════════════════
# LAYOUT — two columns
# ══════════════════════════════════════════════════════════════════════
# Three columns with relative widths 11 : 1 : 12. gap_col is not referenced
# again in this section; it appears to act only as a spacer.
col_left, gap_col, col_right = st.columns([11, 1, 12])

# ── LEFT — INPUT ─────────────────────────────────────────────────────
with col_left:
    # Section labels carry their trailing line break as an escape sequence: a
    # single-quoted literal cannot contain a raw newline.
    st.markdown("01 — Passage\n", unsafe_allow_html=True)

    # Picking a sample pre-fills the text area below with that passage.
    sample_key = st.selectbox(
        "Load a sample",
        list(SAMPLES),
        label_visibility="collapsed",
    )
    default_text = SAMPLES[sample_key]
    passage = st.text_area(
        "passage",
        value=default_text,
        height=240,
        placeholder="Paste any educational paragraph here…",
        label_visibility="collapsed",
    )

    st.markdown("02 — Settings\n", unsafe_allow_html=True)
    num_q = st.slider(
        "Number of questions",
        min_value=1,
        max_value=10,
        value=5,
        help="How many questions to generate (1 – 10)",
    )

    st.markdown("\n", unsafe_allow_html=True)
    # True only on the script run triggered by pressing the button.
    gen_clicked = st.button("⚡ Generate Questions")
# ══════════════════════════════════════════════════════════════════════
# GENERATION LOGIC
# ══════════════════════════════════════════════════════════════════════
def _candidate_answer_spans(doc, limit):
    """Collect up to ``limit`` case-insensitively unique answer spans from ``doc``."""
    spans = []
    seen = set()

    def offer(text):
        # Keep the first spelling of each span, compared case-insensitively.
        key = text.lower()
        if key not in seen:
            seen.add(key)
            spans.append(text)

    # Priority 1: named entities longer than one character and at most five
    # whitespace-separated words.
    for ent in doc.ents:
        text = ent.text.strip()
        if len(text) > 1 and len(text.split()) <= 5:
            offer(text)

    # Priority 2: noun chunks of two to five words, used only as a top-up.
    # The check runs on the de-duplicated count so that repeated entities
    # cannot suppress the top-up and leave fewer candidates than requested.
    if len(spans) < limit:
        for chunk in doc.noun_chunks:
            text = chunk.text.strip()
            if 1 < len(text.split()) <= 5:
                offer(text)

    return spans[:limit]


def generate_questions(passage: str, num_q: int):
    """Generate questions about ``passage`` with the fine-tuned T5 model.

    Candidate answer spans are extracted from the spaCy parse of the passage
    (named entities first, noun chunks as a top-up). For each candidate the
    passage is sent to the model behind a ``generate question:`` prefix and
    the first returned sequence is decoded into a question.

    Relies on the module-level ``nlp``, ``tokenizer``, ``model`` and
    ``device`` objects.

    Args:
        passage: Source text to generate questions from.
        num_q: Maximum number of candidate spans, and therefore questions.

    Returns:
        A list of question strings (no answers), unique ignoring case. It may
        hold fewer than ``num_q`` items, and is empty when ``num_q`` is not
        positive or no candidate span is found.
    """
    if num_q <= 0:
        return []

    candidates = _candidate_answer_spans(nlp(passage), num_q)
    if not candidates:
        return []

    questions = []
    seen_questions = set()
    for answer in candidates:
        # Build highlighted input.
        # NOTE(review): only spaces are inserted around the first occurrence
        # of the answer. If the model was fine-tuned with explicit highlight
        # tokens they are missing here — confirm against the training format.
        highlighted = passage.replace(answer, f" {answer} ", 1)
        encoded = tokenizer(
            f"generate question: {highlighted}",
            return_tensors="pt",
            max_length=512,
            truncation=True,
        ).to(device)
        output_ids = model.generate(
            **encoded,
            max_new_tokens=64,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )
        question = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

        # Skip empty generations and questions already produced (ignoring case).
        key = question.lower()
        if question and key not in seen_questions:
            seen_questions.add(key)
            questions.append(question)
    return questions
# ── RIGHT — OUTPUT ────────────────────────────────────────────────────
with col_right:
    # The label's trailing line break is written as an escape sequence: a
    # single-quoted literal cannot contain a raw newline.
    st.markdown("03 — Generated Questions\n", unsafe_allow_html=True)

    if not gen_clicked:
        # Empty state, shown until the generate button has been pressed.
        st.markdown(
            "\n"
            "✦\n"
            "Questions appear here\n"
            "Choose a sample or paste your passage,\n"
            "set the number of questions, then click generate.\n",
            unsafe_allow_html=True,
        )
    elif not passage or len(passage.strip()) < 40:
        st.warning("Please enter a more detailed passage (at least 40 characters) for best results.")
    else:
        with st.spinner("Generating questions with T5…"):
            questions = generate_questions(passage.strip(), num_q)

        if not questions:
            st.error("Could not extract answer candidates. Try a more detailed or factual passage.")
        else:
            # ── Result header ──────────────────────────────────
            # The header markup is a lone newline in this file.
            st.markdown("\n", unsafe_allow_html=True)

            # ── Question cards (no answers) ────────────────────
            # Each question is actually written to the page. Model output is
            # rendered without unsafe_allow_html so any markup in it is not
            # interpreted as HTML.
            for i, q in enumerate(questions, 1):
                st.markdown(f"**Q{i}.** {q}")

            # ── Download ───────────────────────────────────────
            # One "Q<n>. <question>" entry per question, blank line between.
            download_text = "\n\n".join(
                f"Q{i}. {q}" for i, q in enumerate(questions, 1)
            )
            st.markdown("\n", unsafe_allow_html=True)
            st.download_button(
                label="📄 Download questions (.txt)",
                data=download_text.strip(),
                file_name="questions.txt",
                mime="text/plain",
            )
# ══════════════════════════════════════════════════════════════════════
# FOOTER
# ══════════════════════════════════════════════════════════════════════
# Footer: course, authors, supervisor, model id and dataset, sent to the page
# as one markdown block.
_footer_block = """
MPhil Data Science · NLP Course Project ·
M. Hamza Sajjad & Shanza Gul ·
Supervised by Dr. Adnan Abid
Model: Hamzasajjad38/t5-small-qg
· Dataset:
SQuAD 1.1
"""
st.markdown(_footer_block, unsafe_allow_html=True)